diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile deleted file mode 100644 index e1e6acc24..000000000 --- a/.devops/main-intel.Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 -ARG UBUNTU_VERSION=22.04 - -FROM intel/hpckit:$ONEAPI_VERSION as build - -RUN apt-get update && \ - apt-get install -y git - -WORKDIR /app - -COPY . . - -# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance -RUN mkdir build && \ - cd build && \ - cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \ - cmake --build . --config Release --target main server - -FROM ubuntu:$UBUNTU_VERSION as runtime - -COPY --from=build /app/build/bin/main /main -COPY --from=build /app/build/bin/server /server - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/main" ] diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile deleted file mode 100644 index e343d278c..000000000 --- a/.devops/server-intel.Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 -ARG UBUNTU_VERSION=22.04 - -FROM intel/hpckit:$ONEAPI_VERSION as build - -RUN apt-get update && \ - apt-get install -y git - -WORKDIR /app - -COPY . . - -# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance -RUN mkdir build && \ - cd build && \ - cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \ - cmake --build . --config Release --target main server - -FROM ubuntu:$UBUNTU_VERSION as runtime - -COPY --from=build /app/build/bin/server /server - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/server" ] diff --git a/README-sycl.md b/README-sycl.md index 2b2cfe03a..7aa4274a9 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -1,22 +1,15 @@ # llama.cpp for SYCL -[Background](#background) - -[OS](#os) - -[Intel GPU](#intel-gpu) - -[Linux](#linux) - -[Windows](#windows) - -[Environment Variable](#environment-variable) - -[Known Issue](#known-issue) - -[Q&A](#q&a) - -[Todo](#todo) +- [Background](#background) +- [OS](#os) +- [Intel GPU](#intel-gpu) +- [Docker](#docker) +- [Linux](#linux) +- [Windows](#windows) +- [Environment Variable](#environment-variable) +- [Known Issue](#known-issue) +- [Q&A](#q&a) +- [Todo](#todo) ## Background @@ -36,20 +29,65 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building). |OS|Status|Verified| |-|-|-| -|Linux|Support|Ubuntu 22.04| +|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39| |Windows|Support|Windows 11| ## Intel GPU +### Verified + |Intel GPU| Status | Verified Model| |-|-|-| |Intel Data Center Max Series| Support| Max 1550| |Intel Data Center Flex Series| Support| Flex 170| |Intel Arc Series| Support| Arc 770, 730M| |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake| -|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7| +|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7| +Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use. + +### Memory + +The memory is a limitation to run LLM on GPUs. + +When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`. + +For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+. + +For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+. 
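+
+As a quick reference (an illustrative sketch, not an official step), you can confirm the reported allocation by filtering the llama.cpp log, assuming a local build as described in the Linux/Windows sections below:
+
+```sh
+# Run a short generation and keep only the memory report lines
+# (the exact log prefix may differ between builds and backends).
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -n 1 -ngl 33 2>&1 | grep "buffer size"
+# e.g. llm_load_tensors: buffer size = 3577.56 MiB
+```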
+ +## Docker + +Note: +- Only docker on Linux is tested. Docker on WSL may not work. +- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that) + +### Build the image + +You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference. + + +```sh +# For F16: +#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile . + +# Or, for F32: +docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile . + +# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example +``` + +### Run + +```sh +# Firstly, find all the DRI cards: +ls -la /dev/dri +# Then, pick the card that you want to use. + +# For example with "/dev/dri/card1" +docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +``` ## Linux @@ -63,7 +101,7 @@ Note: for iGPU, please install the client GPU driver. b. Add user to group: video, render. -``` +```sh sudo usermod -aG render username sudo usermod -aG video username ``` @@ -72,7 +110,7 @@ Note: re-login to enable it. c. Check -``` +```sh sudo apt install clinfo sudo clinfo -l ``` @@ -90,7 +128,6 @@ Platform #0: Intel(R) OpenCL HD Graphics 2. Install Intel® oneAPI Base toolkit. - a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). Recommend to install to default folder: **/opt/intel/oneapi**. @@ -99,13 +136,13 @@ Following guide use the default folder as example. If you use other folder, plea b. Check -``` +```sh source /opt/intel/oneapi/setvars.sh sycl-ls ``` -There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**. +There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**. Output (example): ``` @@ -118,21 +155,25 @@ Output (example): 2. Build locally: -``` +Note: +- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference. +- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only. + +```sh mkdir -p build cd build source /opt/intel/oneapi/setvars.sh -#for FP16 -#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference +# For FP16: +#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON -#for FP32 +# Or, for FP32: cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -#build example/main only +# Build example/main only #cmake --build . --config Release --target main -#build all binary +# Or, build all binary cmake --build . --config Release -v cd .. @@ -140,18 +181,16 @@ cd .. or -``` +```sh ./examples/sycl/build.sh ``` -Note: - -- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only. - ### Run 1. Put model file to folder **models** +You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example. + 2. 
Enable oneAPI running environment ``` @@ -162,10 +201,10 @@ source /opt/intel/oneapi/setvars.sh Run without parameter: -``` +```sh ./build/bin/ls-sycl-device -or +# or running the "main" executable and look at the output log: ./build/bin/main ``` @@ -194,13 +233,13 @@ found 4 SYCL devices: Set device ID = 0 by **GGML_SYCL_DEVICE=0** -``` +```sh GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 ``` or run by script: -``` -./examples/sycl/run-llama2.sh +```sh +./examples/sycl/run_llama2.sh ``` Note: @@ -223,7 +262,13 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html). -2. Install Intel® oneAPI Base toolkit. +Note: **The driver is mandatory for compute function**. + +2. Install Visual Studio. + +Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows. + +3. Install Intel® oneAPI Base toolkit. a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). @@ -252,7 +297,7 @@ In oneAPI command line: sycl-ls ``` -There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**. +There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**. Output (example): ``` @@ -260,15 +305,21 @@ Output (example): [opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186] [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044] - ``` -3. Install cmake & make +4. Install cmake & make -a. Download & install cmake for windows: https://cmake.org/download/ +a. Download & install cmake for Windows: https://cmake.org/download/ -b. Download & install make for windows provided by mingw-w64: https://www.mingw-w64.org/downloads/ +b. Download & install make for Windows provided by mingw-w64 +- Download binary package for Windows in https://github.com/niXman/mingw-builds-binaries/releases. + + Like [x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z](https://github.com/niXman/mingw-builds-binaries/releases/download/13.2.0-rt_v11-rev1/x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z). + +- Unzip the binary package. In the **bin** sub-folder and rename **xxx-make.exe** to **make.exe**. + +- Add the **bin** folder path in the Windows system PATH environment. ### Build locally: @@ -309,6 +360,8 @@ Note: 1. Put model file to folder **models** +You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example. + 2. Enable oneAPI running environment - In Search, input 'oneAPI'. @@ -405,7 +458,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block. - Solution: add **--no-mmap**. + Solution: add **--no-mmap** or **--mmap 0**. ## Q&A @@ -419,8 +472,25 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device Miss to enable oneAPI running environment. +- Meet compile error. 
+ + Remove folder **build** and try again. + +- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux. + + Please run **sudo sycl-ls**. + + If you see it in result, please add video/render group to your ID: + + ``` + sudo usermod -aG render username + sudo usermod -aG video username + ``` + + Then **relogin**. + + If you do not see it, please check the installation GPU steps again. + ## Todo -- Support to build in Windows. - - Support multiple cards. diff --git a/common/common.cpp b/common/common.cpp index c9c7d664e..34e62967c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -516,7 +516,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); + params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; } else if (arg == "--lora-scaled") { if (++i >= argc) { @@ -528,7 +528,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { @@ -584,20 +584,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.n_gpu_layers = std::stoi(argv[i]); -#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { if (++i >= argc) { invalid_param = true; break; } params.n_gpu_layers_draft = std::stoi(argv[i]); -#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { invalid_param = true; @@ -638,11 +638,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; - if (split_arg.size() >= LLAMA_MAX_DEVICES) { + if (split_arg.size() >= llama_max_devices()) { invalid_param = true; break; } - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); } else { @@ -665,7 +665,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.antiprompt.push_back(argv[i]); + params.antiprompt.emplace_back(argv[i]); } else if (arg == "-ld" || arg == "--logdir") { if (++i 
>= argc) { invalid_param = true; @@ -881,7 +881,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(llama_model_kv_override()); + params.kv_overrides.emplace_back(); params.kv_overrides.back().key[0] = 0; } @@ -990,30 +990,30 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n"); - if (llama_mlock_supported()) { + if (llama_supports_mlock()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } - if (llama_mmap_supported()) { + if (llama_supports_mmap()) { printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } printf(" --numa attempt optimizations that help on some NUMA systems\n"); printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -ngld N, --n-gpu-layers-draft N\n"); - printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT, --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); -#endif // LLAMA_SUPPORTS_GPU_OFFLOAD + if (llama_supports_gpu_offload()) { + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ngld N, --n-gpu-layers-draft N\n"); + printf(" number of layers to store in VRAM for the draft model\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); + printf(" -ts SPLIT, --tensor-split SPLIT\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); + } printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? 
"true" : "false"); printf(" -gan N, --grp-attn-n N\n"); @@ -1652,7 +1652,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); diff --git a/common/common.h b/common/common.h index 308f16930..9d5ff39e0 100644 --- a/common/common.h +++ b/common/common.h @@ -37,40 +37,39 @@ int32_t get_num_physical_cores(); struct gpt_params { - uint32_t seed = -1; // RNG seed + uint32_t seed = -1; // RNG seed - int32_t n_threads = get_num_physical_cores(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 8; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - float p_accept = 0.5f; // speculative decoding accept probability - float p_split = 0.1f; // speculative decoding split probability - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. 
- int32_t grp_attn_n = 1; // group-attention factor - int32_t grp_attn_w = 512; // group-attention width - int32_t n_print = -1; // print token count every n tokens (-1 = disabled) - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor - float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor - float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor - float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim - int32_t yarn_orig_ctx = 0; // YaRN original context length - int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment - // pinging @cebtenzzre + int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_draft = -1; + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 8; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + float p_accept = 0.5f; // speculative decoding accept probability + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. 
+ int32_t grp_attn_n = 1; // group-attention factor + int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f; // YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // sampling parameters int32_t top_k = 40; // <= 0 to use vocab size diff --git a/common/train.cpp b/common/train.cpp index e6f2f7a2f..e4c3d5df6 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1363,12 +1363,12 @@ bool consume_common_train_arg( *invalid_param = true; return true; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params->n_gpu_layers = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif + if (llama_supports_gpu_offload()) { + params->n_gpu_layers = std::stoi(argv[i]); + } else { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "-h" || arg == "--help") { params->print_usage = true; return true; diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6ab7f486e..a6ffd128b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -203,6 +203,8 @@ class Model: return CodeShellModel if model_architecture == "OrionForCausalLM": return OrionModel + if model_architecture == "InternLM2ForCausalLM": + return InternLM2Model return Model def _is_model_safetensors(self) -> bool: @@ -254,6 +256,8 @@ class Model: return gguf.MODEL_ARCH.CODESHELL if arch == "OrionForCausalLM": return gguf.MODEL_ARCH.ORION + if arch == "InternLM2ForCausalLM": + return gguf.MODEL_ARCH.INTERNLM2 raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -1134,7 +1138,7 @@ class GPT2Model(Model): for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): continue if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): @@ -1344,6 +1348,154 @@ class CodeShellModel(Model): self.gguf_writer.add_tensor("output.weight", data) print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + +class InternLM2Model(Model): + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. 
+ from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + sentencepiece_model = model.ModelProto() + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + print(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉" + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + self.gguf_writer.add_name("InternLM2") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + + def post_write_tensors(self, tensor_map, name, data_torch): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def write_tensors(self): + from einops import rearrange + + num_heads = self.hparams.get("num_attention_heads") + num_kv_heads = self.hparams.get("num_key_value_heads") + hidden_size = self.hparams.get("hidden_size") + q_per_kv = num_heads // num_kv_heads + head_dim = hidden_size // num_heads + num_groups = num_heads // q_per_kv + + block_count = self.hparams["num_hidden_layers"] + model_kv = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + if re.match(qkv_pattern, name): + bid = re.findall(qkv_pattern, name)[0] + qkv = data_torch + qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) + q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + q = rearrange(q, " o g n i -> o (g n i)").T + k = rearrange(k, " o g n i -> o (g n i)").T + v = rearrange(v, " o g n i -> o (g n i)").T + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) + else: + self.post_write_tensors(tensor_map, name, data_torch) + + ###### CONVERSION LOGIC ###### diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 7924db267..b52d68457 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -88,7 +88,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); - const std::vector t_split (LLAMA_MAX_DEVICES, 0.0f); + const std::vector t_split(llama_max_devices(), 0.0f); model_params.n_gpu_layers = n_gpu_layers; model_params.tensor_split = t_split.data(); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index e9b49317f..edb789da4 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -37,6 +37,8 @@ public: void set_parameters(StatParams&& params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix() const; + bool load_imatrix(const char * file_name, bool add); + static bool load_imatrix(const char * file_name, std::unordered_map& imatrix); private: std::unordered_map m_stats; StatParams m_params; @@ -190,6 +192,57 @@ void IMatrixCollector::save_imatrix(const char * fname) const { } } +bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map& imatrix_data) { + std::ifstream in(imatrix_file, std::ios::binary); + if (!in) { + printf("%s: failed to open %s\n",__func__,imatrix_file); + return false; + } + int n_entries; + in.read((char*)&n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + printf("%s: no data in file %s\n", __func__, imatrix_file); + 
return false; + } + for (int i = 0; i < n_entries; ++i) { + int len; in.read((char *)&len, sizeof(len)); + std::vector name_as_vec(len+1); + in.read((char *)name_as_vec.data(), len); + if (in.fail()) { + printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); + return false; + } + name_as_vec[len] = 0; + std::string name{name_as_vec.data()}; + auto& e = imatrix_data[std::move(name)]; + int ncall; + in.read((char*)&ncall, sizeof(ncall)); + int nval; + in.read((char *)&nval, sizeof(nval)); + if (in.fail() || nval < 1) { + printf("%s: failed reading number of values for entry %d\n",__func__,i); + imatrix_data = {}; + return false; + } + e.values.resize(nval); + in.read((char*)e.values.data(), nval*sizeof(float)); + if (in.fail()) { + printf("%s: failed reading data for entry %d\n",__func__,i); + imatrix_data = {}; + return false; + } + e.ncall = ncall; + } + return true; +} + +bool IMatrixCollector::load_imatrix(const char * file_name, bool add) { + if (!add) { + m_stats.clear(); + } + return load_imatrix(file_name, m_stats); +} + static IMatrixCollector g_collector; static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { @@ -270,7 +323,7 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) { +static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -283,6 +336,15 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool auto tim2 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + if (from_chunk > 0) { + if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { + fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); + return false; + } + fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); + tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); + } + if (int(tokens.size()) < 2*n_ctx) { fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, n_ctx); @@ -403,7 +465,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool int main(int argc, char ** argv) { StatParams sparams; + std::string prev_result_file; + std::string combine_files; bool compute_ppl = true; + int from_chunk = 0; std::vector args; args.push_back(argv[0]); int iarg = 1; @@ -424,6 +489,13 @@ int main(int argc, char ** argv) { compute_ppl = false; } else if (arg == "--keep-imatrix") { sparams.keep_every = std::stoi(argv[++iarg]); + } else if (arg == "--continue-from") { + prev_result_file = argv[++iarg]; + } else if (arg == "--combine") { + combine_files = argv[++iarg]; + } + else if (arg == "--from-chunk") { + from_chunk = std::stoi(argv[++iarg]); } else { args.push_back(argv[iarg]); } @@ -437,14 +509,50 @@ int main(int argc, char ** argv) { } } + g_collector.set_parameters(std::move(sparams)); + + if (!combine_files.empty()) { + std::vector files; + size_t pos = 0; + while (true) { + auto new_pos = combine_files.find(',', pos); + if (new_pos != std::string::npos) { + files.emplace_back(combine_files.substr(pos, new_pos - pos)); + pos = new_pos + 1; + } else { + files.emplace_back(combine_files.substr(pos)); + break; + } + 
} + if (files.size() < 2) { + fprintf(stderr, "You must provide at least two comma separated files to use --combine\n"); + return 1; + } + printf("Combining the following %d files\n", int(files.size())); + for (auto& file : files) { + printf(" %s\n", file.c_str()); + if (!g_collector.load_imatrix(file.c_str(), true)) { + fprintf(stderr, "Failed to load %s\n", file.c_str()); + return 1; + } + } + g_collector.save_imatrix(); + return 0; + } + + if (!prev_result_file.empty()) { + if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) { + fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str()); + return 1; + } + } + gpt_params params; params.n_batch = 512; if (!gpt_params_parse(args.size(), args.data(), params)) { return 1; } - g_collector.set_parameters(std::move(sparams)); - params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); @@ -496,7 +604,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", get_system_info(params).c_str()); } - bool OK = compute_imatrix(ctx, params, compute_ppl); + bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); if (!OK) { return 1; } diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index d02824bfa..374e40a7d 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -23,19 +23,23 @@ usage: ./llama-bench [options] options: -h, --help - -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -b, --batch-size (default: 512) - --memory-f32 <0|1> (default: 0) - -t, --threads (default: 16) - -ngl N, --n-gpu-layers (default: 99) - -mg i, --main-gpu (default: 0) - -mmq, --mul-mat-q <0|1> (default: 1) - -ts, --tensor_split - -r, --repetitions (default: 5) - -o, --output (default: md) - -v, --verbose (default: 0) + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -b, --batch-size (default: 512) + -ctk , --cache-type-k (default: f16) + -ctv , --cache-type-v (default: f16) + -t, --threads (default: 112) + -ngl, --n-gpu-layers (default: 99) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -mmp, --mmap <0|1> (default: 1) + -mmq, --mul-mat-q <0|1> (default: 1) + -ts, --tensor_split (default: 0) + -r, --repetitions (default: 5) + -o, --output (default: md) + -v, --verbose (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. ``` @@ -51,6 +55,10 @@ Each test is repeated the number of times given by `-r`, and the results are ave For a description of the other options, see the [main example](../main/README.md). +Note: + +- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`. 
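+
+For example, a minimal run with mmap disabled might look like this (the model path and `-ngl` value are simply the defaults listed above):
+
+```sh
+./llama-bench -m models/7B/ggml-model-q4_0.gguf -ngl 99 -mmp 0
+```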
+ ## Examples ### Text generation with different models diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 4f404f784..90398e6df 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -21,6 +21,7 @@ #include "common.h" #include "build-info.h" #include "ggml-cuda.h" +#include "ggml-sycl.h" // utils static uint64_t get_time_ns() { @@ -121,6 +122,22 @@ static std::string get_gpu_info() { id += "/"; } } +#endif +#ifdef GGML_USE_SYCL + int device_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES); + + for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) { + if (device_list[i] >0 ){ + char buf[128]; + ggml_sycl_get_device_description(i, buf, sizeof(buf)); + id += buf; + id += "/"; + } + } + if (id.length() >2 ) { + id.pop_back(); + } #endif // TODO: other backends return id; @@ -161,7 +178,8 @@ struct cmd_params { std::vector main_gpu; std::vector no_kv_offload; std::vector mul_mat_q; - std::vector> tensor_split; + std::vector> tensor_split; + std::vector use_mmap; int reps; bool verbose; output_formats output_format; @@ -180,7 +198,8 @@ static const cmd_params cmd_params_defaults = { /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, - /* tensor_split */ {{}}, + /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* use_mmap */ {true}, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN @@ -202,6 +221,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); printf(" -ts, --tensor_split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -371,6 +391,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end()); + } else if (arg == "-mmp" || arg == "--mmap") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) { invalid_param = true; @@ -381,10 +408,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { const std::regex regex{R"([;/]+)"}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + GGML_ASSERT(split_arg.size() <= llama_max_devices()); - std::array tensor_split; - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + std::vector tensor_split(llama_max_devices()); + for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { tensor_split[i] = std::stof(split_arg[i]); } else { @@ -442,6 +469,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } if 
(params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } + if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } return params; @@ -460,7 +488,8 @@ struct cmd_params_instance { int main_gpu; bool no_kv_offload; bool mul_mat_q; - std::array tensor_split; + std::vector tensor_split; + bool use_mmap; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -469,6 +498,7 @@ struct cmd_params_instance { mparams.split_mode = split_mode; mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); + mparams.use_mmap = use_mmap; return mparams; } @@ -478,6 +508,7 @@ struct cmd_params_instance { n_gpu_layers == other.n_gpu_layers && split_mode == other.split_mode && main_gpu == other.main_gpu && + use_mmap == other.use_mmap && tensor_split == other.tensor_split; } @@ -504,6 +535,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) + for (const auto & mmp : params.use_mmap) for (const auto & nb : params.n_batch) for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) @@ -528,6 +560,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, + /* .use_mmap = */ mmp, }; instances.push_back(instance); } @@ -550,6 +583,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, + /* .use_mmap = */ mmp, }; instances.push_back(instance); } @@ -566,6 +600,7 @@ struct test { static const bool vulkan; static const bool kompute; static const bool metal; + static const bool sycl; static const bool gpu_blas; static const bool blas; static const std::string cpu_info; @@ -583,7 +618,8 @@ struct test { int main_gpu; bool no_kv_offload; bool mul_mat_q; - std::array tensor_split; + std::vector tensor_split; + bool use_mmap; int n_prompt; int n_gen; std::string test_time; @@ -606,6 +642,7 @@ struct test { no_kv_offload = inst.no_kv_offload; mul_mat_q = inst.mul_mat_q; tensor_split = inst.tensor_split; + use_mmap = inst.use_mmap; n_prompt = inst.n_prompt; n_gen = inst.n_gen; // RFC 3339 date-time format @@ -655,25 +692,29 @@ struct test { if (metal) { return "Metal"; } + if (sycl) { + return GGML_SYCL_NAME; + } if (gpu_blas) { return "GPU BLAS"; } if (blas) { return "BLAS"; } + return "CPU"; } static const std::vector & get_fields() { static const std::vector fields = { "build_commit", "build_number", - "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas", + "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", - "mul_mat_q", "tensor_split", + "mul_mat_q", "tensor_split", "use_mmap", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" @@ -692,8 +733,8 @@ struct test { return INT; } if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || - field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" || - field == "mul_mat_q") { + field == "gpu_blas" || field == "blas" || 
field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || + field == "mul_mat_q" || field == "use_mmap") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -705,7 +746,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; int max_nonzero = 0; - for (int i = 0; i < LLAMA_MAX_DEVICES; i++) { + for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; } @@ -721,13 +762,13 @@ struct test { std::vector values = { build_commit, std::to_string(build_number), std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan), - std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), + std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), - std::to_string(mul_mat_q), tensor_split_str, + std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap), std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), std::to_string(stdev_ts()) @@ -754,6 +795,7 @@ const bool test::kompute = !!ggml_cpu_has_kompute(); const bool test::metal = !!ggml_cpu_has_metal(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::blas = !!ggml_cpu_has_blas(); +const bool test::sycl = !!ggml_cpu_has_sycl(); const std::string test::cpu_info = get_cpu_info(); const std::string test::gpu_info = get_gpu_info(); @@ -896,6 +938,9 @@ struct markdown_printer : public printer { if (field == "no_kv_offload") { return "nkvo"; } + if (field == "use_mmap") { + return "mmap"; + } if (field == "tensor_split") { return "ts"; } @@ -904,43 +949,46 @@ struct markdown_printer : public printer { void print_header(const cmd_params & params) override { // select fields to print - fields.push_back("model"); - fields.push_back("size"); - fields.push_back("params"); - fields.push_back("backend"); + fields.emplace_back("model"); + fields.emplace_back("size"); + fields.emplace_back("params"); + fields.emplace_back("backend"); bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; if (!is_cpu_backend) { - fields.push_back("n_gpu_layers"); + fields.emplace_back("n_gpu_layers"); } if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { - fields.push_back("n_threads"); + fields.emplace_back("n_threads"); } if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { - fields.push_back("n_batch"); + fields.emplace_back("n_batch"); } if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { - fields.push_back("type_k"); + fields.emplace_back("type_k"); } if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { - fields.push_back("type_v"); + fields.emplace_back("type_v"); } if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { - fields.push_back("main_gpu"); + fields.emplace_back("main_gpu"); } if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { - fields.push_back("split_mode"); + fields.emplace_back("split_mode"); } if (params.mul_mat_q.size() > 1 || 
params.mul_mat_q != cmd_params_defaults.mul_mat_q) { - fields.push_back("mul_mat_q"); + fields.emplace_back("mul_mat_q"); } if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { - fields.push_back("no_kv_offload"); + fields.emplace_back("no_kv_offload"); } if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { - fields.push_back("tensor_split"); + fields.emplace_back("tensor_split"); } - fields.push_back("test"); - fields.push_back("t/s"); + if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { + fields.emplace_back("use_mmap"); + } + fields.emplace_back("test"); + fields.emplace_back("t/s"); fprintf(fout, "|"); for (const auto & field : fields) { diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index c6258eba6..9eba791da 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -111,17 +111,71 @@ llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 m llama_print_timings: total time = 34570.79 ms ``` +## Orin compile and run +### compile +```sh +make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 +``` + +### run on Orin +### case 1 +**input** +```sh +./llava-cli \ + -m /data/local/tmp/ggml-model-q4_k.gguf \ + --mmproj /data/local/tmp/mmproj-model-f16.gguf \ + --image /data/local/tmp/demo.jpeg \ + -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" \ + --n-gpu-layers 999 +``` +**output** +```sh + +encode_image_with_clip: image encoded in 296.62 ms by CLIP ( 2.06 ms per image patch) + + Susan Wise Bauer + +llama_print_timings: load time = 1067.64 ms +llama_print_timings: sample time = 1.53 ms / 6 runs ( 0.25 ms per token, 3934.43 tokens per second) +llama_print_timings: prompt eval time = 306.84 ms / 246 tokens ( 1.25 ms per token, 801.72 tokens per second) +llama_print_timings: eval time = 91.50 ms / 6 runs ( 15.25 ms per token, 65.58 tokens per second) +llama_print_timings: total time = 1352.63 ms / 252 tokens +``` + +### case 2 +**input** +```sh +./llava-cli \ + -m /data/local/tmp/ggml-model-q4_k.gguf \ + --mmproj /data/local/tmp/mmproj-model-f16.gguf \ + -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" \ + --n-gpu-layers 999 + +``` +**output** +```sh +encode_image_with_clip: image encoded in 302.15 ms by CLIP ( 2.10 ms per image patch) + + The image features a cat lying in the grass. + +llama_print_timings: load time = 1057.07 ms +llama_print_timings: sample time = 3.27 ms / 11 runs ( 0.30 ms per token, 3360.83 tokens per second) +llama_print_timings: prompt eval time = 213.60 ms / 232 tokens ( 0.92 ms per token, 1086.14 tokens per second) +llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 ms per token, 66.01 tokens per second) +llama_print_timings: total time = 1365.47 ms / 243 tokens +``` + ## Minor shortcomings The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost. 
## TODO -- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid` +- [x] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid` - [ ] Optimize LDP projector performance - Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`; - Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc. -- [ ] run MobileVLM on `Jetson Orin` +- [x] run MobileVLM on `Jetson Orin` - [ ] Support more model variants, such as `MobileVLM-3B`. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0a7dc47e6..5b70b4f9f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -353,12 +353,12 @@ int main(int argc, char ** argv) { // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { params.interactive_first = true; - params.antiprompt.push_back("### Instruction:\n\n"); + params.antiprompt.emplace_back("### Instruction:\n\n"); } // similar for chatml mode else if (params.chatml) { params.interactive_first = true; - params.antiprompt.push_back("<|im_start|>user\n"); + params.antiprompt.emplace_back("<|im_start|>user\n"); } // enable interactive mode if interactive start is specified diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 10bd5282f..a7085f2cb 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -458,14 +458,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::ofstream logits_stream; if (!params.logits_file.empty()) { - logits_stream.open(params.logits_file.c_str()); + logits_stream.open(params.logits_file.c_str(), std::ios::binary); if (!logits_stream.is_open()) { fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); return {}; } fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); logits_stream.write("_logits_", 8); - logits_stream.write((const char *)&n_ctx, sizeof(n_ctx)); + logits_stream.write(reinterpret_cast(&n_ctx), sizeof(n_ctx)); } auto tim1 = std::chrono::high_resolution_clock::now(); @@ -882,7 +882,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t li = hs_cur.common_prefix; for (int s = 0; s < 4; ++s) { for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) { - eval_pairs.push_back(std::make_pair(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1])); + eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]); } ++li; } @@ -1160,13 +1160,13 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0; size_t li = n_base1 - 1; for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) { - eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[0][j+1])); + eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]); } const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix; const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 
1 : 0; li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1; for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) { - eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[1][j+1])); + eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]); } } compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results); @@ -1525,7 +1525,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params size_t li = cur_task.common_prefix; for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1])); + eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]); } ++li; } diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index c0d3088a5..57973e8b4 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -258,13 +258,13 @@ int main(int argc, char ** argv) { invalid_param = true; break; } - params.include_layers.push_back(argv[i]); + params.include_layers.emplace_back(argv[i]); } else if (arg == "-L" || arg == "--exclude-layer") { if (++i >= argc) { invalid_param = true; break; } - params.exclude_layers.push_back(argv[i]); + params.exclude_layers.emplace_back(argv[i]); } else if (arg == "-t" || arg == "--type") { if (++i >= argc) { invalid_param = true; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 36e344fe3..2f818ffd9 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -209,13 +209,13 @@ int main(int argc, char ** argv) { } } else if (strcmp(argv[arg_idx], "--include-weights") == 0) { if (arg_idx < argc-1) { - included_weights.push_back(argv[++arg_idx]); + included_weights.emplace_back(argv[++arg_idx]); } else { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) { if (arg_idx < argc-1) { - excluded_weights.push_back(argv[++arg_idx]); + excluded_weights.emplace_back(argv[++arg_idx]); } else { usage(argv[0]); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4e87f566a..b780d7ccf 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1790,28 +1790,28 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); - if (llama_mlock_supported()) + if (llama_supports_mlock()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } - if (llama_mmap_supported()) + if (llama_supports_mmap()) { printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } printf(" --numa attempt optimizations that help on some NUMA systems\n"); -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - 
row: split rows across GPUs\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row)\n"); -#endif + if (llama_supports_gpu_offload()) { + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); + printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row)\n"); + } printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -a ALIAS, --alias ALIAS\n"); @@ -1885,7 +1885,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - sparams.api_keys.push_back(argv[i]); + sparams.api_keys.emplace_back(argv[i]); } else if (arg == "--api-key-file") { @@ -2067,13 +2067,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params.n_gpu_layers = std::stoi(argv[i]); -#else - LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. " + if (llama_supports_gpu_offload()) { + params.n_gpu_layers = std::stoi(argv[i]); + } else { + LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. 
" "See main README.md for information on enabling GPU BLAS support", {{"n_gpu_layers", params.n_gpu_layers}}); -#endif + } } else if (arg == "--split-mode" || arg == "-sm") { @@ -2116,9 +2116,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + GGML_ASSERT(split_arg.size() <= llama_max_devices()); - for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) + for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) { if (i_device < split_arg.size()) { @@ -2161,7 +2161,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); + params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; } else if (arg == "--lora-scaled") @@ -2177,7 +2177,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; } else if (arg == "--lora-base") @@ -2319,7 +2319,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(llama_model_kv_override()); + params.kv_overrides.emplace_back(); params.kv_overrides.back().key[0] = 0; } diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp index 42847154a..52442e4ca 100644 --- a/examples/sycl/ls-sycl-device.cpp +++ b/examples/sycl/ls-sycl-device.cpp @@ -1,7 +1,9 @@ -/*MIT license - Copyright (C) 2024 Intel Corporation - SPDX-License-Identifier: MIT -*/ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + #include "ggml-sycl.h" diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat index 28d935541..cf621c675 100644 --- a/examples/sycl/win-run-llama2.bat +++ b/examples/sycl/win-run-llama2.bat @@ -2,7 +2,7 @@ :: Copyright (C) 2024 Intel Corporation :: SPDX-License-Identifier: MIT -INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" +set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d82014c69..66ea0bd65 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -524,6 +524,8 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong #define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_TANH_BLOCK_SIZE 256 #define CUDA_RELU_BLOCK_SIZE 256 +#define CUDA_HARDSIGMOID_BLOCK_SIZE 256 +#define CUDA_HARDSWISH_BLOCK_SIZE 256 #define CUDA_SQR_BLOCK_SIZE 256 #define CUDA_CPY_BLOCK_SIZE 32 #define CUDA_SCALE_BLOCK_SIZE 256 @@ -540,6 +542,7 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong #define CUDA_PAD_BLOCK_SIZE 256 #define CUDA_ACC_BLOCK_SIZE 256 #define CUDA_IM2COL_BLOCK_SIZE 256 +#define CUDA_POOL2D_BLOCK_SIZE 256 #define CUDA_Q8_0_NE_ALIGN 2048 @@ -824,6 +827,24 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) { dst[i] = fmaxf(x[i], 0); } +static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) { + const int i = 
blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); +} + +static __global__ void hardswish_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); +} + static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -5824,7 +5845,7 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, } static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { - const int row = blockIdx.y; + const int row = blockIdx.x; const int col = threadIdx.x; float sum = 0.0f; @@ -6146,9 +6167,10 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min, dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); } -static __global__ void im2col_f32_f16( - const float * x, half * dst, - int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW, +template +static __global__ void im2col_kernel( + const float * x, T * dst, int batch_offset, + int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW, int s0, int s1, int p0, int p1, int d0, int d1) { const int i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= pelements) { @@ -6161,21 +6183,73 @@ static __global__ void im2col_f32_f16( const int ky = (i - kd) / OW; const int ix = i % OW; + const int oh = blockIdx.y; + const int batch = blockIdx.z / IC; + const int ic = blockIdx.z % IC; + const int64_t iiw = ix * s0 + kx * d0 - p0; - const int64_t iih = blockIdx.y * s1 + ky * d1 - p1; + const int64_t iih = oh * s1 + ky * d1 - p1; const int64_t offset_dst = - (blockIdx.y * OW + ix) * CHW + - (blockIdx.z * (KW * KH) + ky * KW + kx); + ((batch * OH + oh) * OW + ix) * CHW + + (ic * (KW * KH) + ky * KW + kx); if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst[offset_dst] = __float2half(0.0f); + dst[offset_dst] = 0.0f; } else { - const int64_t offset_src = blockIdx.z * offset_delta; - dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]); + const int64_t offset_src = ic * offset_delta + batch * batch_offset; + dst[offset_dst] = x[offset_src + iih * IW + iiw]; } } +template +static __global__ void pool2d_nchw_kernel( + const int ih, const int iw, const int oh, const int ow, + const int kh, const int kw, const int sh, const int sw, + const int ph, const int pw, const int parallel_elements, + const Ti* src, To* dst, const enum ggml_op_pool op) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= parallel_elements) { + return; + } + + const int I_HW = ih * iw; + const int O_HW = oh * ow; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / ow; + const int cur_ow = idx % O_HW % ow; + const Ti* i_ptr = src + nc * I_HW; + To* o_ptr = dst + nc * O_HW; + const int start_h = cur_oh * sh - ph; + const int bh = max(0, start_h); + const int eh = min(ih, start_h + kh); + const int start_w = cur_ow * sw - pw; + const int bw = max(0, start_w); + const int ew = min(iw, start_w + kw); + const To scale = 1. 
/ (kh * kw); + To res = 0; + + switch (op) { + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; + } + + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { + #if __CUDA_ARCH__ >= 350 + Ti cur = __ldg(i_ptr + i * iw + j); + #else + Ti cur = i_ptr[i * iw + j]; + #endif + switch (op) { + case GGML_OP_POOL_AVG: res += cur * scale; break; + case GGML_OP_POOL_MAX: res = max(res, (To)cur); break; + } + } + } + o_ptr[cur_oh * ow + cur_ow] = res; +} + template static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { @@ -6389,6 +6463,16 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_ relu_f32<<>>(x, dst, k); } +static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE; + hardsigmoid_f32<<>>(x, dst, k); +} + +static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE; + hardswish_f32<<>>(x, dst, k); +} + static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) { const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; leaky_relu_f32<<>>(x, dst, k, negative_slope); @@ -7476,7 +7560,7 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { const dim3 block_dims(WARP_SIZE, 1, 1); - const dim3 block_nums(1, nrows, 1); + const dim3 block_nums(nrows, 1, 1); k_sum_rows_f32<<>>(x, dst, ncols); } @@ -7588,14 +7672,15 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con } } -static void im2col_f32_f16_cuda(const float* x, half* dst, +template +static void im2col_cuda(const float* x, T* dst, int IW, int IH, int OW, int OH, int KW, int KH, int IC, - int offset_delta, + int batch, int batch_offset, int offset_delta, int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { const int parallel_elements = OW * KW * KH; const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; - dim3 block_nums(num_blocks, OH, IC); - im2col_f32_f16<<>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); + dim3 block_nums(num_blocks, OH, batch * IC); + im2col_kernel<<>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); } // buffer pool for cuda @@ -8171,6 +8256,34 @@ static void ggml_cuda_op_relu( (void) src1_dd; } +static void ggml_cuda_op_hardsigmoid( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_cuda_op_hardswish( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + 
GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + static void ggml_cuda_op_leaky_relu( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { @@ -8536,9 +8649,9 @@ static void ggml_cuda_op_dequantize_mul_mat_vec( if (src1_convert_f16) { src1_dfloat = src1_dfloat_a.alloc(ne00); - ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, - ne00, 1, sizeof(float), 0, 0, - ne00, 1, sizeof(half), 0, 0, stream); + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream); } #else const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion @@ -8802,13 +8915,46 @@ static void ggml_cuda_op_alibi( (void) src1_dd; } +static void ggml_cuda_op_pool2d( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + const int64_t IH = src0->ne[1]; + const int64_t IW = src0->ne[0]; + + const int64_t N = dst->ne[3]; + const int64_t OC = dst->ne[2]; + const int64_t OH = dst->ne[1]; + const int64_t OW = dst->ne[0]; + + const int parallel_elements = N * OC * OH * OW; + const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE; + dim3 block_nums(num_blocks); + pool2d_nchw_kernel<<>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op); + + (void) src1; + (void) src1_dd; +} + static void ggml_cuda_op_im2col( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -8830,8 +8976,14 @@ static void ggml_cuda_op_im2col( const int64_t OW = dst->ne[1]; const size_t delta_offset = src1->nb[is_2D ? 
2 : 1] / 4; // nb is byte offset, src is type float32 + const int64_t batch = src1->ne[3]; + const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 - im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + if(dst->type == GGML_TYPE_F16) { + im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + } else { + im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + } (void) src0; (void) src0_dd; @@ -9427,6 +9579,13 @@ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, g ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu); } +static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid); +} + +static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish); +} static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu); } @@ -10217,6 +10376,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); } +static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d); +} + static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col); } @@ -10322,6 +10485,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st case GGML_UNARY_OP_RELU: func = ggml_cuda_relu; break; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_cuda_hardsigmoid; + break; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_cuda_hardswish; + break; default: return false; } @@ -10396,6 +10565,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st case GGML_OP_IM2COL: func = ggml_cuda_im2col; break; + case GGML_OP_POOL_2D: + func = ggml_cuda_pool2d; + break; case GGML_OP_SUM_ROWS: func = ggml_cuda_sum_rows; break; @@ -11124,6 +11296,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: return true; @@ -11222,6 +11396,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_ROPE: case GGML_OP_ALIBI: case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: diff --git a/ggml-metal.m b/ggml-metal.m index 615a5fdc5..82dd8af36 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -135,6 +135,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_ROPE_F16, GGML_METAL_KERNEL_TYPE_ALIBI_F32, GGML_METAL_KERNEL_TYPE_IM2COL_F16, + GGML_METAL_KERNEL_TYPE_IM2COL_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -506,6 +507,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -630,6 +632,10 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const case GGML_OP_ALIBI: case GGML_OP_ROPE: case GGML_OP_IM2COL: + return true; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: + return false; case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_ARGSORT: @@ -2015,7 +2021,7 @@ static bool ggml_metal_graph_compute( { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; @@ -2023,6 +2029,7 @@ static bool ggml_metal_graph_compute( const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; const int32_t N = src1->ne[is_2D ? 3 : 2]; @@ -2043,8 +2050,8 @@ static bool ggml_metal_graph_compute( id pipeline = nil; - switch (src0->type) { - case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break; + switch (dst->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; default: GGML_ASSERT(false); }; diff --git a/ggml-metal.metal b/ggml-metal.metal index 2614d82e8..efed6ad46 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1775,9 +1775,29 @@ kernel void kernel_rope( template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; -kernel void kernel_im2col_f16( +typedef void (im2col_t)( device const float * x, - device half * dst, + device char * dst, + constant int32_t & ofs0, + constant int32_t & ofs1, + constant int32_t & IW, + constant int32_t & IH, + constant int32_t & CHW, + constant int32_t & s0, + constant int32_t & s1, + constant int32_t & p0, + constant int32_t & p1, + constant int32_t & d0, + constant int32_t & d1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template +kernel void kernel_im2col( + device const float * x, + device char * dst, constant int32_t & ofs0, constant int32_t & ofs1, constant int32_t & IW, @@ -1800,14 +1820,19 @@ kernel void kernel_im2col_f16( (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]); + device T * pdst = (device T *) (dst); + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst[offset_dst] = 0.0f; + pdst[offset_dst] = 0.0f; } else { const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1; - dst[offset_dst] = x[offset_src + iih * IW + iiw]; + pdst[offset_dst] = x[offset_src + iih * IW + 
iiw]; } } +template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col; +template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col; + kernel void kernel_upscale_f32( device const char * src0, device char * dst, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 3fc346975..51445b5e7 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -1,7 +1,14 @@ -/*MIT license - Copyright (C) 2024 Intel Corporation - SPDX-License-Identifier: MIT -*/ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// #include #include @@ -330,6 +337,7 @@ namespace dpct } size_t get_global_mem_size() const { return _global_mem_size; } size_t get_local_mem_size() const { return _local_mem_size; } + size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; } /// Returns the maximum clock rate of device's global memory in kHz. If /// compiler does not support this API then returns default value 3200000 kHz. unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } @@ -391,6 +399,10 @@ namespace dpct { _local_mem_size = local_mem_size; } + void set_max_mem_alloc_size(size_t max_mem_alloc_size) + { + _max_mem_alloc_size = max_mem_alloc_size; + } void set_max_work_group_size(int max_work_group_size) { _max_work_group_size = max_work_group_size; @@ -458,6 +470,7 @@ namespace dpct int _max_register_size_per_work_group; size_t _global_mem_size; size_t _local_mem_size; + size_t _max_mem_alloc_size; size_t _max_nd_range_size[3]; int _max_nd_range_size_i[3]; uint32_t _device_id; @@ -509,6 +522,7 @@ namespace dpct dev.get_info()); prop.set_global_mem_size(dev.get_info()); prop.set_local_mem_size(dev.get_info()); + prop.set_max_mem_alloc_size(dev.get_info()); #if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) @@ -637,6 +651,11 @@ namespace dpct return get_device_info().get_global_mem_size(); } + size_t get_max_mem_alloc_size() const + { + return get_device_info().get_max_mem_alloc_size(); + } + /// Get the number of bytes of free and total memory on the SYCL device. /// \param [out] free_memory The number of bytes of free memory on the SYCL device. /// \param [out] total_memory The number of bytes of total memory on the SYCL device. 
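
For context on the `dpct::device_info` changes above: the new `get_max_mem_alloc_size()` accessor exposes the same limit you would obtain from the standard SYCL 2020 device query that the helper caches. Below is a minimal standalone sketch of those queries — illustrative only, not code from this patch — assuming a SYCL 2020 compiler such as icpx:

```cpp
// Illustrative sketch: the standard SYCL 2020 device queries that
// dpct::device_info caches (global/local memory size, max single allocation).
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    sycl::queue q{sycl::default_selector_v};
    const sycl::device dev = q.get_device();

    const size_t global_mem = dev.get_info<sycl::info::device::global_mem_size>();
    const size_t local_mem  = dev.get_info<sycl::info::device::local_mem_size>();
    const size_t max_alloc  = dev.get_info<sycl::info::device::max_mem_alloc_size>();

    std::printf("device          : %s\n", dev.get_info<sycl::info::device::name>().c_str());
    std::printf("global mem size : %zu bytes\n", global_mem);
    std::printf("local mem size  : %zu bytes\n", local_mem);
    std::printf("max single alloc: %zu bytes\n", max_alloc);
    return 0;
}
```

On integrated GPUs the maximum single allocation is often much smaller than the total device memory, which is presumably why the backend now tracks the two limits separately.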
@@ -1347,6 +1366,7 @@ namespace dpct } #else return q.memcpy(to_ptr, from_ptr, size, dep_events); + GGML_UNUSED(direction); #endif // DPCT_USM_LEVEL_NONE } @@ -1648,7 +1668,7 @@ namespace dpct using Ty = typename DataType::T2; Ty s_h; if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only) - detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host) + detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host) .wait(); else s_h = *reinterpret_cast(s); @@ -1672,6 +1692,20 @@ namespace dpct int ldb, const void *beta, void *c, int ldc) { #ifndef __INTEL_MKL__ + GGML_UNUSED(q); + GGML_UNUSED(a_trans); + GGML_UNUSED(b_trans); + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(alpha); + GGML_UNUSED(a); + GGML_UNUSED(lda); + GGML_UNUSED(b); + GGML_UNUSED(ldb); + GGML_UNUSED(beta); + GGML_UNUSED(c); + GGML_UNUSED(ldc); throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " "Project does not support this API."); #else @@ -1811,7 +1845,7 @@ namespace dpct template T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, - int logical_sub_group_size = 32) + unsigned int logical_sub_group_size = 32) { unsigned int id = g.get_local_linear_id(); unsigned int start_index = @@ -2141,6 +2175,7 @@ namespace dpct } #else return q.memcpy(to_ptr, from_ptr, size, dep_events); + GGML_UNUSED(direction); #endif // DPCT_USM_LEVEL_NONE } @@ -2921,7 +2956,6 @@ void ggml_sycl_set_main_device(int main_device); void ggml_sycl_set_mul_mat_q(bool mul_mat_q); void ggml_sycl_set_scratch_size(size_t scratch_size); void ggml_sycl_free_scratch(void); -int ggml_sycl_get_device_count(void); void ggml_sycl_get_device_description(int device, char * description, size_t description_size); bool ggml_backend_is_sycl(ggml_backend_t backend); int ggml_backend_sycl_get_device(ggml_backend_t backend); @@ -3284,7 +3318,7 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo std::ofstream logfile; logfile.open(filename); // printf("local buf element %d\n", total_elements); - for(int i=0; i max ? max : x[i]); } -static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta, +template +static void im2col_kernel(const float *x, T *dst, int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW, int s0, int s1, int p0, int p1, int d0, int d1, @@ -10984,7 +11020,8 @@ static void soft_max_f32_sycl(const float *x, const float *y, float *dst, }); } -static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH, +template +static void im2col_sycl(const float *x, T *dst, int IW, int IH, int OW, int OH, int KW, int KH, int IC, int offset_delta, int s0, int s1, int p0, int p1, int d0, int d1, @@ -11001,7 +11038,7 @@ static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH, sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH, + im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1, item_ct1); }); @@ -11138,10 +11175,10 @@ DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported. 
// g_sycl_pool_handles[GGML_SYCL_MAX_DEVICES]; static dpct::device_ptr g_sycl_pool_addr[GGML_SYCL_MAX_DEVICES] = {0}; static size_t g_sycl_pool_used[GGML_SYCL_MAX_DEVICES] = {0}; -static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB static void *ggml_sycl_pool_malloc_vmm(size_t size, size_t *actual_size) try { - + GGML_UNUSED(size); + GGML_UNUSED(actual_size); return NULL; } catch (sycl::exception const &exc) { @@ -11305,10 +11342,10 @@ void ggml_init_sycl() try { GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); int64_t total_vram = 0; -#if defined(GGML_SYCL_FP16) - fprintf(stderr, "%s: GGML_SYCL_FP16: yes\n", __func__); +#if defined(GGML_SYCL_F16) + fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); #else - fprintf(stderr, "%s: GGML_SYCL_FP16: no\n", __func__); + fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); #endif @@ -11331,9 +11368,8 @@ void ggml_init_sycl() try { if(id!=user_device_id) continue; device_inx++; - int device_vmm = 0; - g_device_caps[device_inx].vmm = !!device_vmm; + g_device_caps[device_inx].vmm = 0; g_device_caps[device_inx].device_id = id; g_sycl_device_id2index[id].index = device_inx; @@ -11341,18 +11377,12 @@ void ggml_init_sycl() try { SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( prop, dpct::dev_mgr::instance().get_device(id)))); - // fprintf(stderr, - // " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, - // prop.get_name(), prop.get_major_version(), - // prop.get_minor_version(), device_vmm ? "yes" : "no"); - g_tensor_split[device_inx] = total_vram; total_vram += prop.get_global_mem_size(); g_device_caps[device_inx].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - // printf("g_device_caps[%d].cc=%d\n", device_inx, g_device_caps[device_inx].cc); } device_inx = -1; for (int id = 0; id < g_all_sycl_device_count; ++id) { @@ -12188,7 +12218,6 @@ inline void ggml_sycl_op_mul_mat_sycl( // ldc == nrows of the matrix that cuBLAS writes into int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; - const int compute_capability = g_device_caps[id].cc; #ifdef GGML_SYCL_F16 bool use_fp16 = true; // TODO(Yu) SYCL capability check #else @@ -12397,7 +12426,7 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0, GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -12420,8 +12449,11 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0, const size_t delta_offset = src1->nb[is_2D ? 
2 : 1] / 4; // nb is byte offset, src is type float32 - im2col_f32_f16_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, - IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + if (dst->type == GGML_TYPE_F16) { + im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + } else { + im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + } (void) src0; (void) src0_dd; @@ -12673,7 +12705,7 @@ static void ggml_sycl_set_peer_access(const int n_tokens) { continue; } - int can_access_peer; + // int can_access_peer; // SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other)); // if (can_access_peer) { // if (enable_peer_access) { @@ -12698,7 +12730,6 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; - const int64_t nrows0 = ggml_nrows(src0); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; @@ -13794,13 +13825,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get(); dst_row_extra.data_device[g_main_device_index] = dst_contiguous.get(); - const dpct::memcpy_direction src1_kind = - src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device - : dpct::device_to_device; - const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU - ? dpct::device_to_host - : dpct::device_to_device; - for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; @@ -14486,6 +14510,37 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ return true; } +GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try { + int max_compute_units = -1; + for(int i=0;i #endif @@ -255,6 +255,7 @@ static size_t vk_staging_offset; static vk_buffer vk_sync_staging; static vk_context * vk_ctx; +static vk_context * vk_transfer_ctx; static bool vk_disable; @@ -264,7 +265,7 @@ size_t vk_output_tensor; #endif static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl; #endif GGML_ASSERT(parameter_count > 0); @@ -368,7 +369,7 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s } static void ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline& pipeline, uint32_t n) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl; #endif // Check if gc already contains pipeline before adding it @@ -413,14 +414,14 @@ static void ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline& pipeline, uin } static void ggml_vk_pipeline_cleanup(vk_pipeline& pipeline) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_pipeline_cleanup(" << pipeline.name << ")" << std::endl; #endif pipeline.descriptor_set_idx = 0; } static vk::CommandBuffer 
ggml_vk_create_cmd_buffer(vk_queue& q) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl; #endif if (q.cmd_buffers.size() > q.cmd_buffer_idx) { @@ -442,7 +443,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) { } static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_submission()" << std::endl; #endif vk_submission s; @@ -453,14 +454,14 @@ static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_sequence_1()" << std::endl; #endif return { ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores)) }; } static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl; #endif if (ctx->seqs.empty()) { @@ -536,7 +537,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) { } static uint32_t ggml_vk_find_queue_family_index(std::vector& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_find_queue_family_index()" << std::endl; #endif const uint32_t qfsize = queue_family_props.size(); @@ -578,7 +579,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector= vk_gc.tl_semaphores.size()) { @@ -642,7 +643,7 @@ static vk::Event ggml_vk_create_event() { } static void ggml_vk_queue_cleanup(vk_queue& q) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_queue_cleanup()" << std::endl; #endif // Requires command buffers to be done @@ -652,7 +653,7 @@ static void ggml_vk_queue_cleanup(vk_queue& q) { } static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_flags) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl; #endif GGML_ASSERT(size > 0); @@ -743,7 +744,7 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf) { if (buf.size == 0) { return; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl; #endif @@ -757,7 +758,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { } static void ggml_vk_sync_buffers(vk_context * ctx) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_sync_buffers()" << std::endl; #endif const std::vector mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } }; @@ -773,7 +774,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) { } static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vector&& events, vk::PipelineStageFlags src_stages, vk::PipelineStageFlags dst_stages) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_wait_events()" << std::endl; #endif if (events.empty()) { @@ -810,7 +811,7 @@ static bool ggml_vk_build_shader(ggml_type type) { } static void ggml_vk_load_shaders() { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_load_shaders()" << std::endl; #endif @@ -849,36 +850,6 @@ static void ggml_vk_load_shaders() { vk_pipeline_matmul_f16_f32_aligned_l = 
ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_len, matmul_f16_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_len, matmul_f16_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_len, matmul_f16_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - - // Build dequant shaders - vk_pipeline_dequant[GGML_TYPE_F32] = ggml_vk_create_pipeline("f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1); - - vk_pipeline_dequant[GGML_TYPE_F16] = ggml_vk_create_pipeline("dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - - // get_rows - vk_pipeline_get_rows[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, 
"main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - - vk_pipeline_get_rows_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); } else { vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline("matmul_f32_l", matmul_f32_l_fp32_len, matmul_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline("matmul_f32_m", matmul_f32_m_fp32_len, matmul_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); @@ -901,36 +872,6 @@ static void ggml_vk_load_shaders() { vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_fp32_len, matmul_f16_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_fp32_len, matmul_f16_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_fp32_len, matmul_f16_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - - // Build dequant shaders - vk_pipeline_dequant[GGML_TYPE_F32] = ggml_vk_create_pipeline("f32_to_f16", f32_to_f16_fp32_len, f32_to_f16_fp32_data, "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1); - - vk_pipeline_dequant[GGML_TYPE_F16] = ggml_vk_create_pipeline("dequant_f16", dequant_f16_fp32_len, dequant_f16_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("dequant_q4_0", dequant_q4_0_fp32_len, dequant_q4_0_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("dequant_q4_1", dequant_q4_1_fp32_len, dequant_q4_1_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("dequant_q5_0", dequant_q5_0_fp32_len, dequant_q5_0_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("dequant_q5_1", 
dequant_q5_1_fp32_len, dequant_q5_1_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("dequant_q8_0", dequant_q8_0_fp32_len, dequant_q8_0_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("dequant_q2_K", dequant_q2_K_fp32_len, dequant_q2_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("dequant_q3_K", dequant_q3_K_fp32_len, dequant_q3_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("dequant_q4_K", dequant_q4_K_fp32_len, dequant_q4_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("dequant_q5_K", dequant_q5_K_fp32_len, dequant_q5_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("dequant_q6_K", dequant_q6_K_fp32_len, dequant_q6_K_fp32_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - - // get_rows - vk_pipeline_get_rows[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16", get_rows_f16_fp32_len, get_rows_f16_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0", get_rows_q4_0_fp32_len, get_rows_q4_0_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1", get_rows_q4_1_fp32_len, get_rows_q4_1_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0", get_rows_q5_0_fp32_len, get_rows_q5_0_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1", get_rows_q5_1_fp32_len, get_rows_q5_1_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0", get_rows_q8_0_fp32_len, get_rows_q8_0_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - - vk_pipeline_get_rows_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16_f32", get_rows_f16_f32_fp32_len, get_rows_f16_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0_f32", get_rows_q4_0_f32_fp32_len, get_rows_q4_0_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1_f32", get_rows_q4_1_f32_fp32_len, get_rows_q4_1_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0_f32", get_rows_q5_0_f32_fp32_len, get_rows_q5_0_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1_f32", get_rows_q5_1_f32_fp32_len, get_rows_q5_1_f32_fp32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0_f32", get_rows_q8_0_f32_fp32_len, get_rows_q8_0_f32_fp32_data, "main", 
3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); } vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); @@ -945,6 +886,36 @@ static void ggml_vk_load_shaders() { vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + // dequant shaders + vk_pipeline_dequant[GGML_TYPE_F32] = ggml_vk_create_pipeline("f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1); + + vk_pipeline_dequant[GGML_TYPE_F16] = ggml_vk_create_pipeline("dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + vk_pipeline_dequant[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + + // get_rows + vk_pipeline_get_rows[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + 
vk_pipeline_get_rows[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + + vk_pipeline_get_rows_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_get_rows_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); vk_pipeline_mul_mat_vec_p021_f16_f32 = ggml_vk_create_pipeline("mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); @@ -983,7 +954,7 @@ static void ggml_vk_load_shaders() { } void ggml_vk_init() { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_init()" << std::endl; #endif static bool initialized = false; @@ -999,17 +970,17 @@ void ggml_vk_init() { vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; const std::vector layers = { -#ifdef VK_VALIDATE +#ifdef GGML_VULKAN_VALIDATE "VK_LAYER_KHRONOS_validation", #endif }; const std::vector extensions = { -#ifdef VK_VALIDATE +#ifdef GGML_VULKAN_VALIDATE "VK_EXT_validation_features", #endif }; vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions); -#ifdef VK_VALIDATE +#ifdef GGML_VULKAN_VALIDATE const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; vk::ValidationFeaturesEXT validation_features = { features_enable, @@ -1120,7 +1091,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; device_extensions.push_back("VK_KHR_16bit_storage"); -#ifdef VK_VALIDATE +#ifdef GGML_VULKAN_VALIDATE device_extensions.push_back("VK_KHR_shader_non_semantic_info"); #endif @@ -1154,6 +1125,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; vk_fence = vk_device.device.createFence({}); vk_ctx = nullptr; + vk_transfer_ctx = nullptr; vk_disable = false; @@ -1166,7 +1138,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; } static vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_to_fp16()" << std::endl; #endif switch (type) { @@ 
-1190,7 +1162,7 @@ static vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) { } static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl; #endif switch (type) { @@ -1219,7 +1191,7 @@ static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type) { static vk_buffer g_vk_buffer_pool[MAX_VK_BUFFERS]; static vk_buffer ggml_vk_pool_malloc(size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl; #endif int best_i = -1; @@ -1253,7 +1225,7 @@ static vk_buffer ggml_vk_pool_malloc(size_t size) { } static void ggml_vk_pool_free(vk_buffer& buffer) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_pool_free(" << buffer.size << ")" << std::endl; #endif for (int i = 0; i < MAX_VK_BUFFERS; ++i) { @@ -1286,7 +1258,7 @@ static vk_buffer ggml_vk_create_buffer_temp(size_t size) { } static void * ggml_vk_host_malloc(size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl; #endif vk_buffer buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); @@ -1309,7 +1281,7 @@ static void ggml_vk_host_free(void* ptr) { if (ptr == nullptr) { return; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl; #endif vk_buffer* buf = nullptr; @@ -1363,7 +1335,7 @@ static void ggml_vk_dispatch_pipeline(vk_context * ctx, vk_pipeline& pipeline, s const uint32_t wg0 = CEIL_DIV(elements[0], pipeline.wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline.wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline.wg_denoms[2]); -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline.name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl; #endif std::vector descriptor_buffer_infos; @@ -1398,7 +1370,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector w } static void ggml_vk_ctx_end(vk_context * ctx) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl; #endif if (ctx->s == nullptr) { @@ -1410,7 +1382,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) { } static void ggml_vk_ctx_begin(vk_context * ctx) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl; #endif if (ctx->s != nullptr) { @@ -1441,7 +1413,7 @@ static void ensure_sync_staging_buffer(size_t size) { } static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl; #endif GGML_ASSERT(!ggml_is_contiguous(tensor)); @@ -1548,7 +1520,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size } static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl; #endif // Buffer is already mapped @@ -1582,7 +1554,7 @@ static void 
ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size ctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); return; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "STAGING" << std::endl; #endif @@ -1619,14 +1591,14 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size } static void ggml_vk_buffer_write_async(vk_context * ctx, vk_buffer* dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl; #endif return ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, size, size, 1, sync_staging); } static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl; #endif // Buffer is already mapped @@ -1653,14 +1625,14 @@ static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * } static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl; #endif ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1); } static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl; #endif GGML_ASSERT(width > 0); @@ -1693,7 +1665,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_ return; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "STAGING" << std::endl; #endif @@ -1722,7 +1694,7 @@ static void ggml_vk_buffer_read_async(vk_context * ctx, vk_buffer* src, size_t o } static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl; #endif if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { @@ -1746,7 +1718,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_ } static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl; #endif VkBufferCopy bc{ src_offset, dst_offset, size }; @@ -1755,7 +1727,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer * dst, size_t } static void ggml_vk_buffer_copy(vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_copy(" << size << ")" << std::endl; #endif VkBufferCopy bc{ src_offset, dst_offset, size }; @@ -1771,7 +1743,7 @@ static void ggml_vk_buffer_copy(vk_buffer * dst, size_t dst_offset, vk_buffer * } static void ggml_vk_buffer_memset(vk_buffer* dst, size_t offset, uint32_t c, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl; #endif vk_context * ctx = 
ggml_vk_create_context(vk_device.transfer_queue); @@ -1785,7 +1757,7 @@ static void ggml_vk_buffer_memset(vk_buffer* dst, size_t offset, uint32_t c, siz } static void ggml_vk_h2d_tensor_2d(vk_context * ctx, vk_buffer * dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl; #endif const uint64_t ne0 = src->ne[0]; @@ -1815,7 +1787,7 @@ static void ggml_vk_h2d_tensor_2d(vk_context * ctx, vk_buffer * dst, size_t offs } static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offset, const ggml_tensor * dst) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl; #endif const uint64_t ne0 = dst->ne[0]; @@ -1841,24 +1813,24 @@ static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offs } static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")"; #endif if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << " = 4" << std::endl; #endif return 4; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << " = 1" << std::endl; #endif return 1; } static uint32_t ggml_vk_guess_matmul_pipeline_align(int m, int n) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl; #endif if (m <= 32 || n <= 32) { @@ -1871,41 +1843,41 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(int m, int n) { } static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, int m, int n, bool aligned) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")"; #endif if (bit16_x && bit16_y) { - if (m <= 32 || n <= 32) { -#ifdef VK_DEBUG + if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? &vk_pipeline_matmul_f16_aligned_s : &vk_pipeline_matmul_f16_s; } if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif return aligned ? &vk_pipeline_matmul_f16_aligned_m : &vk_pipeline_matmul_f16_m; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif return aligned ? &vk_pipeline_matmul_f16_aligned_l : &vk_pipeline_matmul_f16_l; } if (bit16_x && !bit16_y) { - if (m <= 32 || n <= 32) { -#ifdef VK_DEBUG + if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? &vk_pipeline_matmul_f16_f32_aligned_s : &vk_pipeline_matmul_f16_f32_s; } if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif return aligned ? &vk_pipeline_matmul_f16_f32_aligned_m : &vk_pipeline_matmul_f16_f32_m; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif return aligned ? 
&vk_pipeline_matmul_f16_f32_aligned_l : &vk_pipeline_matmul_f16_f32_l; @@ -1914,30 +1886,30 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, in GGML_ASSERT(false); } - if (m <= 32 || n <= 32) { -#ifdef VK_DEBUG + if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? &vk_pipeline_matmul_f32_aligned_s : &vk_pipeline_matmul_f32_s; } if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif return aligned ? &vk_pipeline_matmul_f32_aligned_m : &vk_pipeline_matmul_f32_m; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif return aligned ? &vk_pipeline_matmul_f32_aligned_l : &vk_pipeline_matmul_f32_l; } static void ggml_vk_matmul(vk_context * ctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_matmul(a: (" << a.buffer.buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer.buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer.buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer.buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl; #endif + ggml_vk_sync_buffers(ctx); if (split_k == 1) { - ggml_vk_sync_buffers(ctx); const std::array pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch }); return; @@ -1945,10 +1917,6 @@ static void ggml_vk_matmul(vk_context * ctx, vk_pipeline& pipeline, vk_subbuffer GGML_ASSERT(batch_stride_d == m * n); - // Synchronize the two submissions - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.fillBuffer(split_k_buffer.buffer.buffer, 0, split_k_buffer.size, 0); - ggml_vk_sync_buffers(ctx); const std::array pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; // Make sure enough workgroups get assigned for split k to work ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch }); @@ -1980,7 +1948,7 @@ static vk_pipeline * ggml_vk_get_cpy_pipeline(ggml_type from, ggml_type to) { } static void ggml_vk_cpy_to_contiguous(vk_context * ctx, vk_pipeline * pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) { -#ifdef VK_DEBUG +#ifdef 
GGML_VULKAN_DEBUG std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "buffer in size=" << in.buffer.size << ", buffer out size=" << out.buffer.size << ")" << std::endl; #endif @@ -2002,7 +1970,7 @@ static void ggml_vk_cpy_to_contiguous(vk_context * ctx, vk_pipeline * pipeline, } static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; @@ -2186,7 +2154,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co } static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; @@ -2366,7 +2334,7 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 } static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << 
", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; @@ -2455,7 +2423,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor } static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; @@ -2561,7 +2529,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr } static void ggml_vk_mul_mat(vk_context * ctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl; #endif if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { @@ -2774,7 +2742,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * template static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << 
src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -2876,6 +2844,9 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm x_sz = ggml_nbytes(src0); d_sz = ggml_nbytes(dst); + if (extra_src0->offset + x_sz >= d_X->size) { + x_sz = VK_WHOLE_SIZE; + } if (extra->offset + d_sz >= d_D->size) { d_sz = VK_WHOLE_SIZE; } @@ -2911,12 +2882,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm break; } - x_sz *= ne02 * ne03; - if (y_sz != VK_WHOLE_SIZE) { - y_sz *= ne12 * ne13; - } if (op != GGML_OP_CPY) { - d_sz *= ne02 * ne03; + if (x_sz != VK_WHOLE_SIZE) { + x_sz *= ne02 * ne03; + } + if (y_sz != VK_WHOLE_SIZE) { + y_sz *= ne12 * ne13; + } + if (d_sz != VK_WHOLE_SIZE) { + d_sz *= ne02 * ne03; + } } if (!use_src1 && op == GGML_OP_SOFT_MAX) { @@ -3088,7 +3063,7 @@ static void ggml_vk_nop(vk_context * ctx, const ggml_tensor * src0, ggml_tensor } } -#ifdef VK_RUN_TESTS +#ifdef GGML_VULKAN_RUN_TESTS static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) { if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) { return; @@ -3122,7 +3097,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 template static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl; #endif const size_t x_ne = m * k * batch; @@ -3513,7 +3488,7 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) } static void ggml_vk_test_transfer(size_t ne, bool pinned) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl; #endif // Check transfers are correct @@ -3593,10 +3568,103 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) { free(y); } } + +static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl; +#endif + const size_t x_sz = sizeof(float) * ne; + const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne; + const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); + float * x = (float *) malloc(x_sz); + void * qx = malloc(qx_sz); + vk_buffer qx_buf = ggml_vk_create_buffer_check(qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer x_buf = ggml_vk_create_buffer_check(x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal); + ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16); + + for (size_t i = 0; i < ne; i++) { + x[i] = rand() / (float)RAND_MAX; + } + + std::vector hist_cur(1 << 4, 0); + + vk_pipeline& p = vk_pipeline_dequant[quant]; + + switch(quant) { + case GGML_TYPE_Q4_0: + ggml_quantize_q4_0(x, qx, ne, ne, hist_cur.data()); + break; + case GGML_TYPE_Q4_1: + ggml_quantize_q4_1(x, qx, ne, ne, hist_cur.data()); + break; + case GGML_TYPE_Q5_0: + ggml_quantize_q5_0(x, qx, ne, ne, hist_cur.data()); + break; + case GGML_TYPE_Q5_1: + ggml_quantize_q4_1(x, qx, ne, ne, hist_cur.data()); + break; + case GGML_TYPE_Q8_0: + ggml_quantize_q8_0(x, qx, ne, ne, hist_cur.data()); + break; + case GGML_TYPE_Q2_K: + ggml_quantize_q2_K(x, qx, ne, ne, hist_cur.data()); + break; + case GGML_TYPE_Q3_K: + ggml_quantize_q3_K(x, qx, ne, ne, hist_cur.data()); + break; + 
case GGML_TYPE_Q4_K: + ggml_quantize_q4_K(x, qx, ne, ne, hist_cur.data()); + break; + case GGML_TYPE_Q5_K: + ggml_quantize_q5_K(x, qx, ne, ne, hist_cur.data()); + break; + case GGML_TYPE_Q6_K: + ggml_quantize_q6_K(x, qx, ne, ne, hist_cur.data()); + break; + default: + GGML_ASSERT(false); + } + + ggml_vk_pipeline_allocate_descriptor_sets(p, 1); + + ggml_vk_buffer_write(&qx_buf, 0, qx, qx_sz); + + vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); + ggml_vk_ctx_begin(ctx); + const std::vector pc = { 1, (int)ne, (int)ne, (int)ne }; + ggml_vk_sync_buffers(ctx); + ggml_vk_dispatch_pipeline(ctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); + ggml_vk_ctx_end(ctx); + + auto begin = std::chrono::high_resolution_clock::now(); + + ggml_vk_submit(ctx, vk_fence); + VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + vk_device.device.resetFences({ vk_fence }); + + auto end = std::chrono::high_resolution_clock::now(); + + double ms_dequant = std::chrono::duration_cast(end-begin).count() / 1000.0; + ggml_vk_buffer_read(&x_buf, 0, x_chk, x_sz_f16); + + double avg_err = 0.0; + for (size_t i = 0; i < ne; i++) { + avg_err += std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i])); + } + + std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err / ne << std::endl; + + ggml_vk_destroy_buffer(x_buf); + ggml_vk_destroy_buffer(qx_buf); + + free(x); + free(qx); + free(x_chk); +} #endif static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl; #endif ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; @@ -3620,7 +3688,7 @@ static ggml_tensor * ggml_vk_find_last_use(const ggml_tensor * node, ggml_cgraph } void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; #endif const bool any_on_device = node->backend == GGML_BACKEND_GPU @@ -3739,15 +3807,26 @@ void ggml_vk_preallocate_buffers() { if (vk_disable) { return; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_preallocate_buffers()" << std::endl; std::cerr << "qx_size: " << vk_prealloc_size_qx << " qy_size: " << vk_prealloc_size_qy << " x_size: " << vk_prealloc_size_x << " y_size: " << vk_prealloc_size_y << " split_k_size: " << vk_prealloc_size_split_k << std::endl; #endif -#if defined(VK_RUN_TESTS) +#if defined(GGML_VULKAN_RUN_TESTS) vk_staging = ggml_vk_create_buffer_check(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); ggml_vk_test_transfer(8192 * 1000, false); ggml_vk_test_transfer(8192 * 1000, true); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_0); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_1); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_0); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_1); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q8_0); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q2_K); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q3_K); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_K); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_K); + ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q6_K); + const 
std::vector vals { 8, 8, 8, 100, 46, 576, @@ -3838,7 +3917,7 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ return; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl; #endif vk_semaphore_idx = 0; @@ -4061,7 +4140,7 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor) return true; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl; #endif @@ -4104,7 +4183,7 @@ void ggml_vk_graph_cleanup() { if (vk_disable) { return; } -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_graph_cleanup()" << std::endl; #endif for (auto& buffer : vk_gc.temp_buffers) { @@ -4143,7 +4222,7 @@ void ggml_vk_graph_cleanup() { } static void ggml_vk_cleanup() { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_cleanup()" << std::endl; #endif ggml_vk_destroy_buffer(vk_prealloc_x); @@ -4227,7 +4306,7 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu } GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl; #endif ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; @@ -4247,7 +4326,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b } GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; #endif GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -4260,7 +4339,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu } GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; #endif GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -4316,7 +4395,7 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff } GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl; #endif vk_buffer dev_buffer = ggml_vk_create_buffer_device(size); @@ -4460,7 +4539,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t } GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, 
ggml_tensor * tensor, const void * data, size_t offset, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl; #endif GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); @@ -4468,19 +4547,19 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (vk_ctx == nullptr) { + if (vk_transfer_ctx == nullptr) { // Initialize new transfer context - vk_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_ctx); + vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); + ggml_vk_ctx_begin(vk_transfer_ctx); } - ggml_vk_buffer_write_async(vk_ctx, &extra->buffer_gpu, extra->offset + offset, data, size); + ggml_vk_buffer_write_async(vk_transfer_ctx, &extra->buffer_gpu, extra->offset + offset, data, size); UNUSED(backend); } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl; #endif GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); @@ -4488,32 +4567,32 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (vk_ctx == nullptr) { + if (vk_transfer_ctx == nullptr) { // Initialize new transfer context - vk_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_ctx); + vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); + ggml_vk_ctx_begin(vk_transfer_ctx); } - ggml_vk_buffer_read_async(vk_ctx, &extra->buffer_gpu, extra->offset + offset, data, size); + ggml_vk_buffer_read_async(vk_transfer_ctx, &extra->buffer_gpu, extra->offset + offset, data, size); UNUSED(backend); } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { -#ifdef VK_DEBUG +#ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl; #endif if ((dst->buffer->buft == ggml_backend_vk_buffer_type() || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - if (vk_ctx == nullptr) { + if (vk_transfer_ctx == nullptr) { // Initialize new transfer context - vk_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_ctx); + vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); + ggml_vk_ctx_begin(vk_transfer_ctx); } - ggml_vk_buffer_copy_async(vk_ctx, &src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src)); + ggml_vk_buffer_copy_async(vk_transfer_ctx, &src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src)); return true; } @@ -4523,28 +4602,28 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c } GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { -#ifdef VK_DEBUG +#ifdef 
GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_synchronize()" << std::endl; #endif - if(vk_ctx == nullptr) { + if(vk_transfer_ctx == nullptr) { return; } - ggml_vk_ctx_end(vk_ctx); + ggml_vk_ctx_end(vk_transfer_ctx); - for (auto& cpy : vk_ctx->in_memcpys) { + for (auto& cpy : vk_transfer_ctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(vk_ctx, vk_fence); + ggml_vk_submit(vk_transfer_ctx, vk_fence); VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); vk_device.device.resetFences({ vk_fence }); - for (auto& cpy : vk_ctx->out_memcpys) { + for (auto& cpy : vk_transfer_ctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - vk_ctx = nullptr; + vk_transfer_ctx = nullptr; UNUSED(backend); } diff --git a/ggml.c b/ggml.c index c82276d33..a7c179f4c 100644 --- a/ggml.c +++ b/ggml.c @@ -5349,7 +5349,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int s0, int p0, int d0) { - struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K] + struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K] struct ggml_tensor * result = ggml_mul_mat(ctx, @@ -5427,16 +5427,15 @@ struct ggml_tensor * ggml_conv_depthwise_2d( int p1, int d0, int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), - s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW] - - struct ggml_tensor * result = - ggml_mul_mat(ctx, - ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW] - ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] return result; @@ -5457,7 +5456,8 @@ struct ggml_tensor * ggml_im2col( int p1, int d0, int d1, - bool is_2D) { + bool is_2D, + enum ggml_type dst_type) { if(is_2D) { GGML_ASSERT(a->ne[2] == b->ne[2]); @@ -5481,7 +5481,7 @@ struct ggml_tensor * ggml_im2col( is_2D ? b->ne[3] : 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 
1 : 0) }; ggml_set_op_params(result, params, sizeof(params)); @@ -5506,7 +5506,7 @@ struct ggml_tensor * ggml_conv_2d( int p1, int d0, int d1) { - struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW] + struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW] struct ggml_tensor * result = ggml_mul_mat(ctx, @@ -5632,12 +5632,13 @@ struct ggml_tensor * ggml_pool_2d( is_node = true; } + struct ggml_tensor * result; const int64_t ne[3] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), a->ne[2], }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; ggml_set_op_params(result, params, sizeof(params)); @@ -5645,7 +5646,6 @@ struct ggml_tensor * ggml_pool_2d( result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - return result; } @@ -12493,6 +12493,92 @@ static void ggml_compute_forward_conv_transpose_1d( } } +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_compute_forward_im2col_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; + const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + const int64_t IH = is_2D ? ne11 : 1; + const int64_t IW = ne10; + + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; + + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; + + int ofs0 = is_2D ? nb13 : nb12; + int ofs1 = is_2D ? 
nb12 : nb11; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + float * const wdata = (float *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + + // micro kernel + float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; + } else { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + + // src0: kernel [OC, IC, KH, KW] // src1: image [N, IC, IH, IW] // dst: result [N, OH, OW, IC*KH*KW] @@ -12583,14 +12669,14 @@ static void ggml_compute_forward_im2col( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - switch (src0->type) { + switch (dst->type) { case GGML_TYPE_F16: { ggml_compute_forward_im2col_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - GGML_ASSERT(false); + ggml_compute_forward_im2col_f32(params, src0, src1, dst); } break; default: { @@ -12781,8 +12867,8 @@ static void ggml_compute_forward_pool_2d( const struct ggml_compute_params * params, const struct ggml_tensor * src, struct ggml_tensor * dst) { - assert(src->type == GGML_TYPE_F32); - assert(params->ith == 0); + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -16985,12 +17071,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa struct ggml_cplan cplan; memset(&cplan, 0, sizeof(struct ggml_cplan)); + int max_tasks = 1; + // thread scheduling for the different operations + work buffer size estimation for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; const int n_tasks = ggml_get_n_tasks(node, n_threads); + max_tasks = MAX(max_tasks, n_tasks); + size_t cur = 0; switch (node->op) { @@ -17157,7 +17247,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa work_size += CACHE_LINE_SIZE*(n_threads - 1); } - cplan.n_threads = n_threads; + cplan.n_threads = MIN(max_tasks, n_threads); cplan.work_size = work_size; cplan.work_data = NULL; diff --git a/ggml.h b/ggml.h index c0b1ac60e..030515e16 100644 --- a/ggml.h +++ b/ggml.h @@ -1502,7 +1502,8 @@ extern "C" { int p1, int d0, int d1, - bool is_2D); + bool is_2D, + enum ggml_type dst_type); GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( struct ggml_context * ctx, diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py index 6b1b82bf3..4abb0383f 100644 --- a/ggml_vk_generate_shaders.py +++ b/ggml_vk_generate_shaders.py @@ -157,19 +157,10 @@ struct block_q6_K # Dequant functions shader_f16_dequant_func = """ -#define DEQUANT_FUNC f16vec2 v = f16vec2(data_a[ib + 0], data_a[ib + 1]); -""" -shader_f16_dequant_func_compat = """ #define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]); """ 
shader_q4_0_dequant_func = """ -#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \ -const uint8_t vui = data_a[ib].qs[iqs]; \ -f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \ -v = (v - 8.0hf)*d; -""" -shader_q4_0_dequant_func_compat = """ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \ const uint vui = uint(data_a[ib].qs[iqs]); \ vec2 v = vec2(vui & 0xF, vui >> 4); \ @@ -177,13 +168,6 @@ v = (v - 8.0f)*d; """ shader_q4_1_dequant_func = """ -#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \ -const float16_t m = data_a[ib].m; \ -const uint8_t vui = data_a[ib].qs[iqs]; \ -f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \ -v = v*d + m; -""" -shader_q4_1_dequant_func_compat = """ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \ const float m = float(data_a[ib].m); \ const uint vui = uint(data_a[ib].qs[iqs]); \ @@ -192,14 +176,6 @@ v = v*d + m; """ shader_q5_0_dequant_func = """ -#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \ -const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \ -const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \ -const uint8_t vui = data_a[ib].qs[iqs]; \ -f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \ -v = (v - 16.0hf) * d; -""" -shader_q5_0_dequant_func_compat = """ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \ const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \ const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \ @@ -209,14 +185,6 @@ v = (v - 16.0f) * d; """ shader_q5_1_dequant_func = """ -#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \ -const float16_t m = data_a[ib].m; \ -const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \ -const uint8_t vui = data_a[ib].qs[iqs]; \ -f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \ -v = v*d + m; -""" -shader_q5_1_dequant_func_compat = """ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \ const float m = float(data_a[ib].m); \ const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \ @@ -226,11 +194,6 @@ v = v*d + m; """ shader_q8_0_dequant_func = """ -#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \ -f16vec2 v = f16vec2(data_a[ib].qs[iqs], data_a[ib].qs[iqs + 1]); \ -v = v * d; -""" -shader_q8_0_dequant_func_compat = """ #define DEQUANT_FUNC const float d = float(data_a[ib].d); \ vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \ v = v * d; @@ -1689,7 +1652,8 @@ void main() { } const float xi = float(data_a[i]); - data_d[i] = D_TYPE(0.5f*xi*(1.0f + tanh(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)))); + const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi); + data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1))); } """ @@ -2109,7 +2073,7 @@ lock = asyncio.Lock() shader_fnames = [] -async def string_to_spv(name, code, defines, fp16): +async def string_to_spv(name, code, defines, fp16=True): f = NamedTemporaryFile(mode="w", delete=False) f.write(code) f.flush() @@ -2199,64 +2163,6 @@ async def main(): tasks.append(string_to_spv("matmul_f16_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) tasks.append(string_to_spv("matmul_f16_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) - # Build dequant shaders - tasks.append(string_to_spv("f32_to_f16", 
f32_to_f16_src, {}, fp16)) - - for i in range(0, VK_NUM_TYPES): - stream.clear() - - stream.extend((dequant_head, shader_int8_ext, shader_float_type)) - - if i == GGML_TYPE_F16: - stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, dequant_body)) - elif i == GGML_TYPE_Q4_0: - stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, dequant_body)) - elif i == GGML_TYPE_Q4_1: - stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, dequant_body)) - elif i == GGML_TYPE_Q5_0: - stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, dequant_body)) - elif i == GGML_TYPE_Q5_1: - stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, dequant_body)) - elif i == GGML_TYPE_Q8_0: - stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, dequant_body)) - elif i == GGML_TYPE_Q2_K: - stream.extend((shader_q2_K_defines, dequant_q2_K_body)) - elif i == GGML_TYPE_Q3_K: - stream.extend((shader_q3_K_defines, dequant_q3_K_body)) - elif i == GGML_TYPE_Q4_K: - stream.extend((shader_q4_K_defines, dequant_q4_K_body)) - elif i == GGML_TYPE_Q5_K: - stream.extend((shader_q5_K_defines, dequant_q5_K_body)) - elif i == GGML_TYPE_Q6_K: - stream.extend((shader_q6_K_defines, dequant_q6_K_body)) - else: - continue - - tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}, fp16)) - - # get_rows - for i in range(0, VK_NUM_TYPES): - stream.clear() - stream.extend((generic_head, shader_int8_ext, shader_float_type)) - - if i == GGML_TYPE_F16: - stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, get_rows_body)) - elif i == GGML_TYPE_Q4_0: - stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, get_rows_body)) - elif i == GGML_TYPE_Q4_1: - stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, get_rows_body)) - elif i == GGML_TYPE_Q5_0: - stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, get_rows_body)) - elif i == GGML_TYPE_Q5_1: - stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, get_rows_body)) - elif i == GGML_TYPE_Q8_0: - stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, get_rows_body)) - else: - continue - - tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}, fp16)) - tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}, fp16)) - # Shaders where precision is needed, so no fp16 version # mul mat vec @@ -2265,17 +2171,17 @@ async def main(): stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32)) if i == GGML_TYPE_F16: - stream.extend((shader_f16_defines, shader_f16_dequant_func_compat, mul_mat_vec_body)) + stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q4_0: - stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat, mul_mat_vec_body)) + stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body)) 
elif i == GGML_TYPE_Q4_1: - stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat, mul_mat_vec_body)) + stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q5_0: - stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat, mul_mat_vec_body)) + stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q5_1: - stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat, mul_mat_vec_body)) + stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q8_0: - stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat, mul_mat_vec_body)) + stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q2_K: stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body)) elif i == GGML_TYPE_Q3_K: @@ -2289,43 +2195,101 @@ async def main(): else: continue - tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}, fp16)) + tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION})) - tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True)) - tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True)) + # Dequant shaders + for i in range(0, VK_NUM_TYPES): + stream.clear() + + stream.extend((dequant_head, shader_int8_ext, shader_f32)) + + if i == GGML_TYPE_F16: + stream.extend((shader_f16_defines, shader_f16_dequant_func, dequant_body)) + elif i == GGML_TYPE_Q4_0: + stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, dequant_body)) + elif i == GGML_TYPE_Q4_1: + stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, dequant_body)) + elif i == GGML_TYPE_Q5_0: + stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, dequant_body)) + elif i == GGML_TYPE_Q5_1: + stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, dequant_body)) + elif i == GGML_TYPE_Q8_0: + stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, dequant_body)) + elif i == GGML_TYPE_Q2_K: + stream.extend((shader_q2_K_defines, dequant_q2_K_body)) + elif i == GGML_TYPE_Q3_K: + stream.extend((shader_q3_K_defines, dequant_q3_K_body)) + elif i == GGML_TYPE_Q4_K: + stream.extend((shader_q4_K_defines, dequant_q4_K_body)) + elif i == GGML_TYPE_Q5_K: + stream.extend((shader_q5_K_defines, dequant_q5_K_body)) + elif i == GGML_TYPE_Q6_K: + stream.extend((shader_q6_K_defines, dequant_q6_K_body)) + else: + continue + + tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"})) + + tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {})) + + # get_rows + for i in range(0, VK_NUM_TYPES): + stream.clear() + stream.extend((generic_head, shader_int8_ext, shader_f32)) + + if i == GGML_TYPE_F16: + stream.extend((shader_f16_defines, shader_f16_dequant_func, get_rows_body)) + elif i == GGML_TYPE_Q4_0: + stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body)) + elif i == GGML_TYPE_Q4_1: + stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body)) + elif i == GGML_TYPE_Q5_0: + stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body)) + 
elif i == GGML_TYPE_Q5_1: + stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body)) + elif i == GGML_TYPE_Q8_0: + stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body)) + else: + continue + + tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"})) + tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"})) + + tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"})) # Norms - tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) - tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) - tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}, True)) - tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True)) + tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"})) + tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) - tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}, True)) - tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {})) + tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": 
"float"})) - tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) - tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) - tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True)) + tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}, True)) - tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True)) + tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) - tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}, True)) - tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True)) + tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) await asyncio.gather(*tasks) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f5c933a41..ed8e26f83 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -72,6 +72,7 @@ class Keys: PAD_ID = "tokenizer.ggml.padding_token_id" ADD_BOS = "tokenizer.ggml.add_bos_token" ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" HF_JSON = "tokenizer.huggingface.json" RWKV = "tokenizer.rwkv.world" CHAT_TEMPLATE = "tokenizer.chat_template" @@ -102,6 +103,7 @@ class MODEL_ARCH(IntEnum): PLAMO = auto() CODESHELL = auto() ORION = auto() + INTERNLM2 = auto() class MODEL_TENSOR(IntEnum): @@ -153,6 +155,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -446,6 +449,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.INTERNLM2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + 
MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d93aaa877..16808196e 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -411,6 +411,9 @@ class GGUFWriter: def add_add_eos_token(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_EOS, value) + def add_add_space_prefix(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) + def add_chat_template(self, value: str) -> None: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index de177af13..4f16d8504 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -19,6 +19,7 @@ class TensorNameMap: "language_model.embedding.word_embeddings", # persimmon "wte", # gpt2 "transformer.embd.wte", # phi2 + "model.tok_embeddings", # internlm2 ), # Token type embeddings @@ -42,7 +43,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen - "output", # llama-pth bloom + "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 ), @@ -51,7 +52,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon - "model.norm", # llama-hf baichuan + "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth "embeddings.LayerNorm", # bert "transformer.norm_f", # mpt @@ -84,6 +85,7 @@ class TensorNameMap: "h.{bid}.ln_1", # gpt2 "transformer.h.{bid}.ln", # phi2 "model.layers.layers.{bid}.norm", # plamo + "model.layers.{bid}.attention_norm", # internlm2 ), # Attention norm 2 @@ -111,6 +113,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq" # internlm2 ), # Attention key @@ -120,6 +123,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk" # internlm2 ), # Attention value @@ -129,6 +133,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv" # internlm2 ), # Attention output @@ -147,6 +152,7 @@ class TensorNameMap: "h.{bid}.attn.c_proj", # gpt2 "transformer.h.{bid}.mixer.out_proj", # phi2 "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 ), # Rotary embeddings @@ -169,6 +175,7 @@ class TensorNameMap: "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi "h.{bid}.ln_2", # gpt2 + "model.layers.{bid}.ffn_norm", # internlm2 ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -194,6 +201,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2 "model.layers.layers.{bid}.mlp.up_proj", # plamo + "model.layers.{bid}.feed_forward.w3", # internlm2 ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -212,6 +220,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.w1", # llama-pth "transformer.h.{bid}.mlp.w2", # qwen "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ 
-236,6 +245,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.fc2", # phi2 "model.layers.{bid}.mlp.fc2", # phi2 "model.layers.layers.{bid}.mlp.down_proj", # plamo + "model.layers.{bid}.feed_forward.w2", # internlm2 ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/llama.cpp b/llama.cpp index f5d59951c..884f5f5a7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -228,10 +228,11 @@ enum llm_arch { LLM_ARCH_PLAMO, LLM_ARCH_CODESHELL, LLM_ARCH_ORION, + LLM_ARCH_INTERNLM2, LLM_ARCH_UNKNOWN, }; -static std::map<llm_arch, std::string> LLM_ARCH_NAMES = { +static std::map<llm_arch, const char *> LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_GPT2, "gpt2" }, @@ -250,6 +251,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = { { LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, }; enum llm_kv { @@ -302,11 +304,12 @@ enum llm_kv { LLM_KV_TOKENIZER_PAD_ID, LLM_KV_TOKENIZER_ADD_BOS, LLM_KV_TOKENIZER_ADD_EOS, + LLM_KV_TOKENIZER_ADD_PREFIX, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, }; -static std::map<llm_kv, std::string> LLM_KV_NAMES = { +static std::map<llm_kv, const char *> LLM_KV_NAMES = { { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, @@ -356,6 +359,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = { { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, }; @@ -366,7 +370,7 @@ struct LLM_KV { llm_arch arch; std::string operator()(llm_kv kv) const { - return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); + return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]); } }; @@ -693,7 +697,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, - + { + LLM_ARCH_INTERNLM2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -751,13 +771,13 @@ struct LLM_TN { // gguf helpers // -static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = { +static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = { { LLAMA_ROPE_SCALING_NONE, "none" }, { LLAMA_ROPE_SCALING_LINEAR, "linear" }, { LLAMA_ROPE_SCALING_YARN, "yarn" }, }; -static int8_t llama_rope_scaling_type_from_string(const std::string & name) { +static int32_t llama_rope_scaling_type_from_string(const std::string & name) { for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { if (kv.second == name) { return kv.first; @@ -1405,6 +1425,7 @@ enum e_model { MODEL_13B, MODEL_14B, MODEL_15B, + MODEL_20B, MODEL_30B, MODEL_34B, MODEL_40B, @@ -1422,6 +1443,7 @@ static const size_t GiB = 1024*MiB; struct llama_hparams { bool vocab_only; + bool rope_finetuned; uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; @@ -1441,8 +1463,7 @@ struct llama_hparams { float
rope_freq_base_train; float rope_freq_scale_train; uint32_t n_yarn_orig_ctx; - int8_t rope_scaling_type_train : 3; - bool rope_finetuned : 1; + int32_t rope_scaling_type_train; float f_clamp_kqv; float f_max_alibi_bias; @@ -1646,6 +1667,7 @@ struct llama_vocab { id special_suffix_id = 32008; id special_eot_id = 32010; + bool add_space_prefix = true; int find_bpe_rank(std::string token_left, std::string token_right) const { // GGML_ASSERT(token_left.find(" ") == std::string::npos); // GGML_ASSERT(token_left.find("\n") == std::string::npos); @@ -2725,7 +2747,7 @@ struct llama_model_loader { // load LLaMA models // -static std::string llama_model_arch_name(llm_arch arch) { +static const char * llama_model_arch_name(llm_arch arch) { auto it = LLM_ARCH_NAMES.find(arch); if (it == LLM_ARCH_NAMES.end()) { return "unknown"; @@ -2778,6 +2800,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_13B: return "13B"; case MODEL_14B: return "14B"; case MODEL_15B: return "15B"; + case MODEL_20B: return "20B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; case MODEL_40B: return "40B"; @@ -2790,6 +2813,14 @@ static const char * llama_model_type_name(e_model type) { default: return "?B"; } } +static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ + switch (type) { + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + default: return "unknown"; + } +} + static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); @@ -3053,6 +3084,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_INTERNLM2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 48: model.type = e_model::MODEL_20B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3105,6 +3145,11 @@ static void llm_load_vocab( vocab.special_unk_id = 0; vocab.special_sep_id = -1; vocab.special_pad_id = -1; + + const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); + if (add_space_prefix_keyidx != -1) { + vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); + } // The default value of add_space_prefix is true. } else if (tokenizer_name == "gpt2") { vocab.type = LLAMA_VOCAB_TYPE_BPE; @@ -3330,12 +3375,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; - const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); // hparams LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); - LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); - LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? 
"SPM" : "BPE"); // TODO: fix + LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch)); + LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type)); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); @@ -3356,7 +3401,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); - LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); @@ -4091,8 +4136,35 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_INTERNLM2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -4739,126 +4811,6 @@ struct llm_build_context { ctx0 = nullptr; } } - struct ggml_cgraph * build_orion() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, 
lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); - - // shift the entire K-cache if needed - if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - // if (model.layers[il].bq) { - // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - // cb(Qcur, "Qcur", il); - // } - - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - // if (model.layers[il].bk) { - // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - // cb(Kcur, "Kcur", il); - // } - - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - // if (model.layers[il].bv) { - // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - // cb(Vcur, "Vcur", il); - // } - - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -6662,6 +6614,245 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_orion() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * 
cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + // if (model.layers[il].bq) { + // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + // cb(Qcur, "Qcur", il); + // } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + // if (model.layers[il].bk) { + // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + // cb(Kcur, "Kcur", il); + // } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + // if (model.layers[il].bv) { + // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + // cb(Vcur, "Vcur", il); + // } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_internlm2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == 
hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + }; static struct ggml_cgraph * llama_build_graph( @@ -6820,6 +7011,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_orion(); } break; + case LLM_ARCH_INTERNLM2: + { + result = 
llm.build_internlm2(); + } break; default: GGML_ASSERT(false); } @@ -7986,7 +8181,9 @@ static std::vector llama_tokenize_internal(const llama_vocab & // auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); if (&fragment == &fragment_buffer.front()) { - raw_text = " " + raw_text; // prefix with space if the first token is not special + if (vocab.add_space_prefix) { + raw_text = " " + raw_text; // prefix with space if the first token is not special + } } #ifdef PRETOKENIZERDEBUG @@ -10413,18 +10610,37 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { return result; } -int32_t llama_max_devices(void) { - return LLAMA_MAX_DEVICES; +size_t llama_max_devices(void) { + return 16; } -bool llama_mmap_supported(void) { +bool llama_supports_mmap(void) { return llama_mmap::SUPPORTED; } -bool llama_mlock_supported(void) { +bool llama_supports_mlock(void) { return llama_mlock::SUPPORTED; } +bool llama_supports_gpu_offload(void) { +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) + // Defined when llama.cpp is compiled with support for offloading model layers to GPU. + return true; +#else + return false; +#endif +} + +// deprecated: +bool llama_mmap_supported(void) { + return llama_supports_mmap(); +} + +bool llama_mlock_supported(void) { + return llama_supports_mlock(); +} + void llama_backend_init(bool numa) { ggml_time_init(); @@ -10456,8 +10672,8 @@ int64_t llama_time_us(void) { } struct llama_model * llama_load_model_from_file( - const char * path_model, - struct llama_model_params params) { + const char * path_model, + struct llama_model_params params) { ggml_time_init(); llama_model * model = new llama_model; @@ -10833,7 +11049,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", - llama_model_arch_name(model->arch).c_str(), + llama_model_arch_name(model->arch), llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } @@ -11485,22 +11701,24 @@ struct llama_batch llama_batch_get_one( }; } -struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { +struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) { llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; if (embd) { - batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); + batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); } else { - batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); + batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } - batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); - batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); - batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); - for (int i = 0; i < n_tokens; ++i) { + batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc); + batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc); + batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1)); + for (int i = 0; i < n_tokens_alloc; ++i) { batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); } - batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + 
batch.seq_id[n_tokens_alloc] = nullptr; + + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc); return batch; } @@ -11511,7 +11729,7 @@ void llama_batch_free(struct llama_batch batch) { if (batch.pos) free(batch.pos); if (batch.n_seq_id) free(batch.n_seq_id); if (batch.seq_id) { - for (int i = 0; i < batch.n_tokens; ++i) { + for (int i = 0; batch.seq_id[i] != nullptr; ++i) { free(batch.seq_id[i]); } free(batch.seq_id); diff --git a/llama.h b/llama.h index 8c01f5bf4..0b0109eb3 100644 --- a/llama.h +++ b/llama.h @@ -3,16 +3,7 @@ #include "ggml.h" #include "ggml-backend.h" -#ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" -#define LLAMA_MAX_DEVICES 16 -#elif defined(GGML_USE_SYCL) -#include "ggml-sycl.h" -#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES -#else -//just max it out, same as GGML_CUDA_MAX_DEVICES -#define LLAMA_MAX_DEVICES 16 -#endif // GGML_USE_CUBLAS + #include #include #include @@ -50,12 +41,6 @@ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 4 -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) -// Defined when llama.cpp is compiled with support for offloading model layers to GPU. -#define LLAMA_SUPPORTS_GPU_OFFLOAD -#endif - #ifdef __cplusplus extern "C" { #endif @@ -202,7 +187,7 @@ extern "C" { // LLAMA_SPLIT_LAYER: ignored int32_t main_gpu; - // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() const float * tensor_split; // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. @@ -228,7 +213,7 @@ extern "C" { uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing - int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` + int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency, 0 = from model @@ -339,9 +324,14 @@ extern "C" { LLAMA_API int64_t llama_time_us(void); - LLAMA_API int32_t llama_max_devices(void); - LLAMA_API bool llama_mmap_supported (void); - LLAMA_API bool llama_mlock_supported(void); + LLAMA_API size_t llama_max_devices(void); + + LLAMA_API bool llama_supports_mmap (void); + LLAMA_API bool llama_supports_mlock (void); + LLAMA_API bool llama_supports_gpu_offload(void); + + LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead"); + LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead"); LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index 7bf0929bb..0b83cdbbc 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -141,6 +141,28 @@ for wt in "${wtypes[@]}"; do wfiles+=("") done +# map wtype input to index +if [[ ! 
-z "$wtype" ]]; then + iw=-1 + is=0 + for wt in "${wtypes[@]}"; do + # uppercase + uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]') + if [[ "$uwt" == "$wtype" ]]; then + iw=$is + break + fi + is=$((is+1)) + done + + if [[ $iw -eq -1 ]]; then + printf "[-] Invalid weight type: %s\n" "$wtype" + exit 1 + fi + + wtype="$iw" +fi + # sample repos repos=( "https://huggingface.co/TheBloke/Llama-2-7B-GGUF" @@ -252,8 +274,10 @@ for file in $model_files; do printf " %2d) %s %s\n" $iw "$have" "$file" done +wfile="${wfiles[$wtype]}" + # ask for weights type until provided and available -while [[ -z "$wtype" ]]; do +while [[ -z "$wfile" ]]; do printf "\n" read -p "[+] Select weight type: " wtype wfile="${wfiles[$wtype]}" diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 1d29070b6..eb06123d2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -227,6 +227,14 @@ static std::string var_to_str(ggml_type type) { return ggml_type_name(type); } +static std::string var_to_str(ggml_op_pool pool) { + switch (pool) { + case GGML_OP_POOL_AVG: return "avg"; + case GGML_OP_POOL_MAX: return "max"; + default: return std::to_string(pool); + } +} + #define VARS_TO_STR1(a) VAR_TO_STR(a) #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b) #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c) @@ -238,6 +246,7 @@ static std::string var_to_str(ggml_type type) { #define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i) #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j) #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k) +#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l) #ifdef GGML_USE_SYCL static bool inline _isinf(float f) { @@ -1162,10 +1171,45 @@ struct test_alibi : public test_case { } }; +// GGML_OP_POOL2D +struct test_pool2d : public test_case { + enum ggml_op_pool pool_type; + const ggml_type type_input; + const std::array ne_input; + // kernel size + const int k0; + const int k1; + // stride + const int s0; + const int s1; + // padding + const int p0; + const int p1; + + std::string vars() override { + return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1); + } + + test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG, + ggml_type type_input = GGML_TYPE_F32, + std::array ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1] + int k0 = 3, int k1 = 3, + int s0 = 1, int s1 = 1, + int p0 = 1, int p1 = 1) + : pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data()); + ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1); + return out; + } +}; + // GGML_OP_IM2COL struct test_im2col : public test_case { const ggml_type type_input; const ggml_type type_kernel; + const ggml_type dst_type; const std::array ne_input; const std::array ne_kernel; // stride @@ -1181,22 +1225,22 @@ struct test_im2col : public test_case { const bool is_2D; std::string vars() override { - return VARS_TO_STR11(type_input, type_kernel, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D); + return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, 
ne_kernel, s0, s1, p0, p1, d0, d1, is_2D); } - test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, + test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32, std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1] std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1] int s0 = 1, int s1 = 1, int p0 = 1, int p1 = 1, int d0 = 1, int d1 = 1, bool is_2D = true) - : type_input(type_input), type_kernel(type_kernel), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {} + : type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data()); ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data()); - ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D); + ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type); return out; } }; @@ -1912,6 +1956,27 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } } + for (ggml_type type_input : {GGML_TYPE_F32}) { + for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) { + for (int k0 : {1, 3}) { + for (int k1 : {1, 3}) { + for (int s0 : {1, 2}) { + for (int s1 : {1, 2}) { + for (int p0 : {0, 1}) { + for (int p1 : {0, 1}) { + test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1)); + } + } + } + } + } + } + } + } + + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16)); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1})); @@ -2049,7 +2114,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } test_cases.emplace_back(new test_alibi()); - test_cases.emplace_back(new test_im2col()); test_cases.emplace_back(new test_concat(GGML_TYPE_F32)); test_cases.emplace_back(new test_concat(GGML_TYPE_I32));