diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile
new file mode 100644
index 000000000..4f83904bc
--- /dev/null
+++ b/.devops/server-cuda.Dockerfile
@@ -0,0 +1,32 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+ apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+COPY --from=build /app/server /server
+
+ENTRYPOINT [ "/server" ]
diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile
new file mode 100644
index 000000000..e343d278c
--- /dev/null
+++ b/.devops/server-intel.Dockerfile
@@ -0,0 +1,25 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+ARG UBUNTU_VERSION=22.04
+
+FROM intel/hpckit:$ONEAPI_VERSION as build
+
+RUN apt-get update && \
+ apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+# for some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
+RUN mkdir build && \
+ cd build && \
+ cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
+ cmake --build . --config Release --target main server
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/build/bin/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
diff --git a/.devops/server-rocm.Dockerfile b/.devops/server-rocm.Dockerfile
new file mode 100644
index 000000000..e9a31647c
--- /dev/null
+++ b/.devops/server-rocm.Dockerfile
@@ -0,0 +1,45 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+ gfx803 \
+ gfx900 \
+ gfx906 \
+ gfx908 \
+ gfx90a \
+ gfx1010 \
+ gfx1030 \
+ gfx1100 \
+ gfx1101 \
+ gfx1102
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+ && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set ROCm GPU architecture targets
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/server" ]
diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile
new file mode 100644
index 000000000..134588fe2
--- /dev/null
+++ b/.devops/server.Dockerfile
@@ -0,0 +1,20 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+ apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
diff --git a/README_sycl.md b/README_sycl.md
new file mode 100644
index 000000000..d5a1818f5
--- /dev/null
+++ b/README_sycl.md
@@ -0,0 +1,252 @@
+# llama.cpp for SYCL
+
+[Background](#background)
+
+[OS](#os)
+
+[Intel GPU](#intel-gpu)
+
+[Linux](#linux)
+
+[Environment Variable](#environment-variable)
+
+[Known Issue](#known-issue)
+
+[Todo](#todo)
+
+## Background
+
+SYCL is a higher-level programming model that improves programming productivity on various hardware accelerators, such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+
+oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
+
+Intel uses SYCL as the direct programming language to support its CPUs, GPUs, and FPGAs.
+
+To avoid reinventing the wheel, this code refers to other code paths in llama.cpp (such as OpenBLAS, cuBLAS, and CLBlast). We use the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate to SYCL.
+
+llama.cpp for SYCL is used to support Intel GPUs.
+
+For Intel CPUs, we recommend using llama.cpp for x86 (built with Intel MKL).
+
+## OS
+
+|OS|Status|Verified|
+|-|-|-|
+|Linux|Support|Ubuntu 22.04|
+|Windows|Ongoing| |
+
+
+## Intel GPU
+
+|Intel GPU| Status | Verified Model|
+|-|-|-|
+|Intel Data Center Max Series| Support| Max 1550|
+|Intel Data Center Flex Series| Support| Flex 170|
+|Intel Arc Series| Support| Arc 770|
+|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
+|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
+
+
+## Linux
+
+### Setup Environment
+
+1. Install Intel GPU driver.
+
+a. Please install the Intel GPU driver by following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
+
+Note: for iGPU, please install the client GPU driver.
+
+b. Add your user to the video and render groups:
+
+```
+sudo usermod -aG render username
+sudo usermod -aG video username
+```
+
+Note: log out and log back in for the group changes to take effect.
+
+c. Check
+
+```
+sudo apt install clinfo
+sudo clinfo -l
+```
+
+Output (example):
+
+```
+Platform #0: Intel(R) OpenCL Graphics
+ `-- Device #0: Intel(R) Arc(TM) A770 Graphics
+
+
+Platform #0: Intel(R) OpenCL HD Graphics
+ `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
+```
+
+2. Install Intel® oneAPI Base toolkit.
+
+
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+
+We recommend installing to the default folder: **/opt/intel/oneapi**.
+
+The following guide uses the default folder as an example. If you installed to a different folder, adjust the paths accordingly.
+
+b. Check
+
+```
+source /opt/intel/oneapi/setvars.sh
+
+sycl-ls
+```
+
+There should be one or more Level Zero devices listed, such as **[ext_oneapi_level_zero:gpu:0]**.
+
+Output (example):
+```
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+
+```
+
+3. Build locally:
+
+```
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+
+#for FP32
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v
+
+```
+
+or
+
+```
+./examples/sycl/build.sh
+```
+
+Note:
+
+- By default, all binaries are built, which takes more time. To reduce build time, we recommend building only **example/main**.
+
+### Run
+
+1. Put the model file in the **models** folder.
+
+2. Enable the oneAPI runtime environment:
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. List the device IDs.
+
+Run without parameters:
+
+```
+./build/bin/ls-sycl-device
+
+or
+
+./build/bin/main
+```
+
+Check the device IDs in the startup log, for example:
+
+```
+found 4 SYCL devices:
+ Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+ Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
+ max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
+ Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
+ max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
+ Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+
+```
+
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level Zero runtime, recommended|
+|compute capability 3.0|OpenCL runtime, slower than Level Zero in most cases|
+
+4. Set the device ID and run llama.cpp.
+
+Set the device ID to 0 with **GGML_SYCL_DEVICE=0**:
+
+```
+GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+Or run the script:
+
+```
+./examples/sycl/run-llama2.sh
+```
+
+Note:
+
+- By default, mmap is used to read the model file. In some cases this causes a hang. We recommend using the **--no-mmap** parameter to disable mmap() and avoid the issue.
+
+
+5. Check the device ID in the output.
+
+For example:
+```
+Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+```
+
+
+## Environment Variable
+
+#### Build
+
+|Name|Value|Function|
+|-|-|-|
+|LLAMA_SYCL|ON (mandatory)|Enable the SYCL code path. For both FP32 and FP16, LLAMA_SYCL=ON is mandatory.|
+|LLAMA_SYCL_F16|ON (optional)|Enable FP16 in the SYCL code path, which is faster for long-prompt inference. For FP32, do not set it.|
+|CMAKE_C_COMPILER|icx|Use the icx compiler for the SYCL code path|
+|CMAKE_CXX_COMPILER|icpx|Use the icpx compiler for the SYCL code path|
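+
+For example, an FP16 configuration that combines these variables (the same flags shown in the build section above):
+
+```
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+```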
+
+#### Running
+
+
+|Name|Value|Function|
+|-|-|-|
+|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the available device IDs in the default run output|
+|GGML_SYCL_DEBUG|0 (default) or 1|Enable the debug log controlled by the GGML_SYCL_DEBUG macro|
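+
+For example, a run selecting device 1 with the debug log enabled (the model path and prompt are illustrative only):
+
+```
+GGML_SYCL_DEVICE=1 GGML_SYCL_DEBUG=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 32 -e -ngl 33
+```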
+
+## Known Issue
+
+- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
+
+ The oneAPI runtime environment is not enabled.
+
+ Install the oneAPI Base Toolkit and enable the environment with: `source /opt/intel/oneapi/setvars.sh`.
+
+
+- Hang during startup
+
+ llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems, the memcpy misbehaves and blocks.
+
+ Solution: add **--no-mmap**.
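+
+ For example (the same command as in the run section, with mmap disabled):
+
+ ```
+ GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 --no-mmap
+ ```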
+
+## Todo
+
+- Support building on Windows.
+
+- Support multiple GPU cards.
diff --git a/common/common.cpp b/common/common.cpp
index 1889c0903..7b11d388b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -43,6 +43,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
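+// convenience define: the GPU-related warnings below apply to both the cuBLAS and SYCL backends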
+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUBLAS_SYCL
+#endif
+
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -600,9 +604,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
@@ -619,9 +623,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
+
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
@@ -644,9 +649,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f;
}
}
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--numa") {
@@ -1008,7 +1013,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-#endif
+#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
@@ -1515,7 +1520,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
diff --git a/common/sampling.cpp b/common/sampling.cpp
index efd7eab6e..e8675a8c0 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
// will be empty (default) if there are parse errors
if (result->parsed_grammar.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+ delete result;
return nullptr;
}
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 7a0a8c3db..6ab7f486e 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -201,6 +201,8 @@ class Model:
return PlamoModel
if model_architecture == "CodeShellForCausalLM":
return CodeShellModel
+ if model_architecture == "OrionForCausalLM":
+ return OrionModel
return Model
def _is_model_safetensors(self) -> bool:
@@ -250,6 +252,8 @@ class Model:
return gguf.MODEL_ARCH.PLAMO
if arch == "CodeShellForCausalLM":
return gguf.MODEL_ARCH.CODESHELL
+ if arch == "OrionForCausalLM":
+ return gguf.MODEL_ARCH.ORION
raise NotImplementedError(f'Architecture "{arch}" not supported!')
@@ -572,6 +576,83 @@ class MPTModel(Model):
self.gguf_writer.add_tensor("output.weight", data)
+class OrionModel(Model):
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()
+
+ def set_gguf_parameters(self):
+ block_count = self.hparams["num_hidden_layers"]
+ head_count = self.hparams["num_attention_heads"]
+ head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+ hf_repo = self.hparams.get("_name_or_path", "")
+
+ ctx_length = 0
+ if "max_sequence_length" in self.hparams:
+ ctx_length = self.hparams["max_sequence_length"]
+ elif "max_position_embeddings" in self.hparams:
+ ctx_length = self.hparams["max_position_embeddings"]
+ elif "model_max_length" in self.hparams:
+ ctx_length = self.hparams["model_max_length"]
+ else:
+ print("gguf: can not find ctx length parameter.")
+ sys.exit()
+
+ self.gguf_writer.add_file_type(self.ftype)
+ self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_source_hf_repo(hf_repo)
+ self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
+ self.gguf_writer.add_context_length(ctx_length)
+ self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+ self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+ self.gguf_writer.add_head_count(head_count)
+ self.gguf_writer.add_head_count_kv(head_count_kv)
+ self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
+
+ def write_tensors(self):
+ # Collect tensors from generator object
+ model_kv = dict(self.get_tensors())
+ block_count = self.hparams["num_hidden_layers"]
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+ for name, data_torch in model_kv.items():
+ # we don't need these
+ if name.endswith(".rotary_emb.inv_freq"):
+ continue
+
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ data = data_torch.squeeze().numpy()
+
+ # map tensor names
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+ self.gguf_writer.add_tensor(new_name, data)
+
+
class BaichuanModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f67d74c55..68ad89964 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -23,6 +23,9 @@ else()
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
+ if (LLAMA_SYCL)
+ add_subdirectory(sycl)
+ endif()
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 890b75cdc..774b54eac 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -242,7 +242,7 @@ int main(int argc, char ** argv) {
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
- if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+ if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 4a0338a37..9129052a2 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -98,6 +98,7 @@ static std::string format(const char * fmt, ...) {
enum projector_type {
PROJECTOR_TYPE_MLP,
+ PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -304,10 +305,18 @@ struct clip_vision_model {
struct ggml_tensor * projection;
// LLaVA projection
- struct ggml_tensor * mm_0_w;
- struct ggml_tensor * mm_0_b;
- struct ggml_tensor * mm_2_w;
- struct ggml_tensor * mm_2_b;
+ struct ggml_tensor * mm_0_w = NULL;
+ struct ggml_tensor * mm_0_b = NULL;
+ struct ggml_tensor * mm_2_w = NULL;
+ struct ggml_tensor * mm_2_b = NULL;
+
+ // Yi type models with mlp+normalization projection
+ struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+ struct ggml_tensor * mm_1_b = NULL;
+ struct ggml_tensor * mm_3_w = NULL;
+ struct ggml_tensor * mm_3_b = NULL;
+ struct ggml_tensor * mm_4_w = NULL;
+ struct ggml_tensor * mm_4_b = NULL;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w;
@@ -460,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// pre-layernorm
{
embeddings = ggml_norm(ctx0, embeddings, eps);
+ ggml_set_name(embeddings, "pre_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
}
@@ -575,6 +585,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+ embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+ // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+ // First LayerNorm
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+ model.mm_1_b);
+
+ // GELU activation
+ embeddings = ggml_gelu(ctx0, embeddings);
+
+ // Second linear layer
+ embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+ embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+ // Second LayerNorm
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+ model.mm_4_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
@@ -808,6 +839,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
else {
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
+ if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+ if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+ new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+ }
+ }
}
#ifdef GGML_USE_CUBLAS
@@ -956,11 +992,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
// LLaVA projection
- if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+ if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
- vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
- vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+ try {
+ // Yi-type llava
+ vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+ vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // missing in Yi-type llava
+ vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+ vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // Yi-type llava
+ vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+ vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+ } catch (std::runtime_error & e) { }
+ try {
+ // Yi-type llava
+ vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+ vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+ } catch (std::runtime_error & e) { }
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
@@ -1277,7 +1331,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
".*weight",
};
- std::vector<uint8_t> read_data(512);
std::vector<uint8_t> work(512);
std::vector<float> conv_buf(512);
std::vector<int64_t> hist_all(1 << 4, 0);
@@ -1433,6 +1486,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+ return ctx->vision_model.mm_3_b->ne[0];
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index d94795fe3..6ac70ba69 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
- // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
- eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
+ std::string system_prompt, user_prompt;
+ size_t image_pos = prompt.find("");
+ if (image_pos != std::string::npos) {
+ // new templating mode: Provide the full prompt including system message and use as a placeholder for the image
+
+ system_prompt = prompt.substr(0, image_pos);
+ user_prompt = prompt.substr(image_pos + std::string("").length());
+ // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+ size_t pos = 0;
+ while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+ user_prompt.replace(pos, 2, "\n");
+ pos += 1; // Advance past the replaced newline
+ }
+ while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+ system_prompt.replace(pos, 2, "\n");
+ pos += 1; // Advance past the replaced newline
+ }
+
+ printf("system_prompt: %s\n", system_prompt.c_str());
+ printf("user_prompt: %s\n", user_prompt.c_str());
+ } else {
+ // llava-1.5 native mode
+ system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+ user_prompt = prompt + "\nASSISTANT:";
+ }
+
+ eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
- eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+ eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// generate the response
@@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
if (strcmp(tmp, "</s>") == 0) break;
+ if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
fflush(stdout);
diff --git a/examples/server/README.md b/examples/server/README.md
index fd3034b99..dce4ec47c 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -30,7 +30,8 @@ Command line options:
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
-
+- `--grp-attn-n`: Set the group attention factor to extend the context size through self-extend (default: 1 = disabled); used together with the group attention width `--grp-attn-w` (see the example below)
+- `--grp-attn-w`: Set the group attention width to extend the context size through self-extend (default: 512); used together with the group attention factor `--grp-attn-n`
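+
+A minimal example of enabling self-extend on the server (the model path and the chosen values are illustrative only):
+
+```bash
+./server -m models/7B/ggml-model.gguf -c 8192 --grp-attn-n 4 --grp-attn-w 512
+```
+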
## Build
server is built alongside everything else from the root of the project
@@ -65,6 +66,14 @@ server.exe -m models\7B\ggml-model.gguf -c 2048
The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
+### Docker:
+```bash
+docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
+
+# or, with CUDA:
+docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+```
+
## Testing with CURL
Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index eb9555ab2..f7c5847c7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -185,6 +185,12 @@ struct llama_client_slot
struct llama_sampling_params sparams;
llama_sampling_context *ctx_sampling = nullptr;
+ int32_t ga_i = 0; // group-attention state
+ int32_t ga_n = 1; // group-attention factor
+ int32_t ga_w = 512; // group-attention width
+
+ int32_t n_past_se = 0; // self-extend
+
// multimodal
std::vector images;
@@ -213,7 +219,8 @@ struct llama_client_slot
sent_count = 0;
sent_token_probs_index = 0;
infill = false;
-
+ ga_i = 0;
+ n_past_se = 0;
generated_token_probs.clear();
for (slot_image & img : images)
@@ -400,9 +407,26 @@ struct llama_server_context
slot.id = i;
slot.n_ctx = n_ctx_slot;
- slot.reset();
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+
+ const int ga_n = params.grp_attn_n;
+ const int ga_w = params.grp_attn_w;
+
+ if (ga_n != 1) {
+ GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+ GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+ //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
+ //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
+ LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+ }
+
+ slot.ga_i = 0;
+ slot.ga_n = ga_n;
+ slot.ga_w = ga_w;
+
+ slot.reset();
+
slots.push_back(slot);
}
@@ -658,7 +682,7 @@ struct llama_server_context
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
size_t end_prefix = pos;
pos += pattern.length();
- size_t end_pos = prompt.find("]", pos);
+ size_t end_pos = prompt.find(']', pos);
if (end_pos != std::string::npos)
{
std::string image_id = prompt.substr(pos, end_pos - pos);
@@ -1350,32 +1374,35 @@ struct llama_server_context
for (llama_client_slot &slot : slots)
{
- if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
+ if (slot.ga_n == 1)
{
- // Shift context
- const int n_left = slot.n_past - slot.params.n_keep - 1;
- const int n_discard = n_left / 2;
-
- LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
- llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
- llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
-
- for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
+ if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
{
- slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+ // Shift context
+ const int n_left = slot.n_past - slot.params.n_keep - 1;
+ const int n_discard = n_left / 2;
+
+ LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
+ llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
+ llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
+
+ for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
+ {
+ slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+ }
+
+ slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+
+ slot.n_past -= n_discard;
+
+ slot.truncated = true;
+
+ LOG_VERBOSE("context shift", {
+ { "n_ctx", n_ctx },
+ { "n_keep", params.n_keep },
+ { "n_left", n_left },
+ });
}
-
- slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
- slot.n_past -= n_discard;
-
- slot.truncated = true;
-
- LOG_VERBOSE("context shift", {
- {"n_ctx", n_ctx},
- {"n_keep", params.n_keep},
- {"n_left", n_left},
- });
}
}
@@ -1402,7 +1429,8 @@ struct llama_server_context
slot.i_batch = batch.n_tokens;
- llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);
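+ // when group-attention / self-extend is active, positions are tracked in n_past_se instead of n_past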
+ const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
+ llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
slot.n_past += 1;
}
@@ -1500,6 +1528,8 @@ struct llama_server_context
llama_sampling_reset(slot.ctx_sampling);
slot.n_past = 0;
+ slot.n_past_se = 0;
+ slot.ga_i = 0;
slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
}
else
@@ -1513,6 +1543,25 @@ struct llama_server_context
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
+ if (slot.ga_n != 1)
+ {
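+ // restore the self-extend position (n_past_se) and group-attention index (ga_i) for the cached part of the prompt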
+ int ga_i = 0;
+ int32_t ga_n = slot.ga_n;
+ int32_t ga_w = slot.ga_w;
+ int32_t slot_npast = 0;
+ for (int k = 0; k < slot.n_past; ++k)
+ {
+ while (slot_npast >= ga_i + ga_w) {
+ const int bd = (ga_w/ga_n)*(ga_n - 1);
+ slot_npast -= bd;
+ ga_i += ga_w/ga_n;
+ }
+ slot_npast++;
+ }
+ slot.n_past_se = slot_npast;
+ slot.ga_i = ga_i;
+ }
+
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
}
@@ -1527,6 +1576,10 @@ struct llama_server_context
// we have to evaluate at least 1 token to generate logits.
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
slot.n_past--;
+ if (slot.ga_i > 0)
+ {
+ slot.n_past_se--;
+ }
}
LOG_VERBOSE("prompt ingested", {
@@ -1539,9 +1592,22 @@ struct llama_server_context
// process the prefix of first image
std::vector prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
+ int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
+ int ga_i = slot.ga_i;
+ int32_t ga_n = slot.ga_n;
+ int32_t ga_w = slot.ga_w;
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
{
- llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
+ if (slot.ga_n != 1)
+ {
+ while (slot_npast >= ga_i + ga_w) {
+ const int bd = (ga_w/ga_n)*(ga_n - 1);
+ slot_npast -= bd;
+ ga_i += ga_w/ga_n;
+ }
+ }
+ llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
+ slot_npast += 1;
}
if (has_images && !ingest_images(slot, n_batch))
@@ -1571,6 +1637,36 @@ struct llama_server_context
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+
+ for (auto & slot : slots)
+ {
+ if (slot.ga_n != 1)
+ {
+ // context extension via Self-Extend
+ while (slot.n_past_se >= slot.ga_i + slot.ga_w)
+ {
+ const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
+ const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
+ const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
+
+ LOG_TEE("\n");
+ LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+ LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+ LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+
+ llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
+ llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
+ llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
+
+ slot.n_past_se -= bd;
+
+ slot.ga_i += slot.ga_w / slot.ga_n;
+
+ LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+ }
+ slot.n_past_se += n_tokens;
+ }
+ }
llama_batch batch_view =
{
n_tokens,
@@ -1584,6 +1680,7 @@ struct llama_server_context
};
const int ret = llama_decode(ctx, batch_view);
+
if (ret != 0)
{
if (n_batch == 1 || ret < 0)
@@ -1729,6 +1826,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+ printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
+ printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
printf("\n");
}
@@ -1914,6 +2013,25 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.n_threads = std::stoi(argv[i]);
}
+ else if (arg == "--grp-attn-n" || arg == "-gan")
+ {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+
+ params.grp_attn_n = std::stoi(argv[i]);
+ }
+ else if (arg == "--grp-attn-w" || arg == "-gaw")
+ {
+ if (++i >= argc)
+ {
+ invalid_param = true;
+ break;
+ }
+
+ params.grp_attn_w = std::stoi(argv[i]);
+ }
else if (arg == "--threads-batch" || arg == "-tb")
{
if (++i >= argc)
@@ -1982,7 +2100,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@@ -2008,7 +2126,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.mul_mat_q = false;
#else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
@@ -2021,7 +2139,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index e2b6065f7..70cce0721 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -249,6 +249,7 @@ struct llama_server_queue {
}
// Start the main loop. This call is blocking
+ [[noreturn]]
void start_loop() {
while (true) {
// new task arrived
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
new file mode 100644
index 000000000..69cf8932e
--- /dev/null
+++ b/examples/sycl/CMakeLists.txt
@@ -0,0 +1,9 @@
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+set(TARGET ls-sycl-device)
+add_executable(${TARGET} ls-sycl-device.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/sycl/README.md b/examples/sycl/README.md
new file mode 100644
index 000000000..b46f17f39
--- /dev/null
+++ b/examples/sycl/README.md
@@ -0,0 +1,47 @@
+# llama.cpp/example/sycl
+
+This example program provides the tools for llama.cpp for SYCL on Intel GPUs.
+
+## Tool
+
+|Tool Name| Function|Status|
+|-|-|-|
+|ls-sycl-device|List all SYCL devices with ID, compute capability, max work group size, etc.|Support|
+
+### ls-sycl-device
+
+List all SYCL devices with ID, compute capability, max work group size, etc.
+
+1. Build llama.cpp for SYCL with all targets.
+
+2. Enable the oneAPI runtime environment:
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. Execute
+
+```
+./build/bin/ls-sycl-device
+```
+
+Check the device IDs in the startup log, for example:
+
+```
+found 4 SYCL devices:
+ Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+ Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
+ max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
+ Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
+ max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
+ Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+
+```
+
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level Zero runtime, recommended|
+|compute capability 3.0|OpenCL runtime, slower than Level Zero in most cases|
diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh
new file mode 100755
index 000000000..26ad2f7da
--- /dev/null
+++ b/examples/sycl/build.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+
+#for FP32
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v
diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
new file mode 100644
index 000000000..42847154a
--- /dev/null
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -0,0 +1,11 @@
+/*MIT license
+ Copyright (C) 2024 Intel Corporation
+ SPDX-License-Identifier: MIT
+*/
+
+#include "ggml-sycl.h"
+
+int main(int argc, char ** argv) {
+ ggml_backend_sycl_print_sycl_devices();
+ return 0;
+}
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
new file mode 100755
index 000000000..f5f4c1e98
--- /dev/null
+++ b/examples/sycl/run-llama2.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+source /opt/intel/oneapi/setvars.sh
+
+if [ $# -gt 0 ]; then
+ export GGML_SYCL_DEVICE=$1
+else
+ export GGML_SYCL_DEVICE=0
+fi
+echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+#export GGML_SYCL_DEBUG=1
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
diff --git a/ggml-backend.c b/ggml-backend.c
index dcde54418..8b6cf7c9f 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -358,6 +358,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_cuda_reg_devices();
#endif
+#ifdef GGML_USE_SYCL
+ extern void ggml_backend_sycl_reg_devices(void);
+ ggml_backend_sycl_reg_devices();
+#endif
+
#ifdef GGML_USE_METAL
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
diff --git a/ggml-metal.m b/ggml-metal.m
index d2deb55a8..d791de547 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -24,10 +24,7 @@
#define UNUSED(x) (void)(x)
-#define GGML_METAL_MAX_KERNELS 256
-
struct ggml_metal_kernel {
- id<MTLFunction> function;
id<MTLComputePipelineState> pipeline;
};
@@ -159,11 +156,10 @@ struct ggml_metal_context {
id<MTLDevice> device;
id<MTLCommandQueue> queue;
- id<MTLLibrary> library;
dispatch_queue_t d_queue;
- struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
+ struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
bool support_simdgroup_reduction;
bool support_simdgroup_mm;
@@ -246,6 +242,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->queue = [ctx->device newCommandQueue];
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
+ id<MTLLibrary> metal_library;
+
// load library
{
NSBundle * bundle = nil;
@@ -260,7 +258,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
- ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+ metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
@@ -302,7 +300,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
//[options setFastMathEnabled:false];
- ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+ metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
@@ -367,8 +365,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
{
NSError * error = nil;
- for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
- ctx->kernels[i].function = nil;
+ for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
ctx->kernels[i].pipeline = nil;
}
@@ -380,10 +377,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(e, name, supported) \
if (supported) { \
struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
- kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
- kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
+ id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
+ kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
+ [metal_function release]; \
if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+ [metal_library release]; \
return NULL; \
} \
} else { \
@@ -512,23 +511,17 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
}
+ [metal_library release];
return ctx;
}
static void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
- for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
- if (ctx->kernels[i].pipeline) {
- [ctx->kernels[i].pipeline release];
- }
-
- if (ctx->kernels[i].function) {
- [ctx->kernels[i].function release];
- }
+ for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
+ [ctx->kernels[i].pipeline release];
}
- [ctx->library release];
[ctx->queue release];
[ctx->device release];
@@ -2382,6 +2375,16 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backen
UNUSED(buft);
}
+GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+ id<MTLDevice> device = ggml_backend_metal_get_device();
+ size_t max_size = device.maxBufferLength;
+ ggml_backend_metal_free_device();
+
+ return max_size;
+
+ UNUSED(buft);
+}
+
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
@@ -2400,7 +2403,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
- /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
+ /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index edd802bb9..d52e75c4e 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -2134,6 +2134,15 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
GGML_UNUSED(buffer_type);
}
+static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
+ static size_t max_size = -1;
+ if (max_size == (size_t)-1) {
+ ggml_cl_init();
+ clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
+ }
+ return max_size;
+}
+
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
return ggml_backend_is_cpu(backend);
@@ -2145,7 +2154,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
- /* .get_max_size = */ NULL, // TODO: return from device info
+ /* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
new file mode 100644
index 000000000..3fc346975
--- /dev/null
+++ b/ggml-sycl.cpp
@@ -0,0 +1,15199 @@
+/*MIT license
+ Copyright (C) 2024 Intel Corporation
+ SPDX-License-Identifier: MIT
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+#include
+#include
+
+#include "ggml-sycl.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
+/*
+Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
+*/
+// COPY from DPCT head files
+#include
+#include
+#include