diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile
index 3d1abe612..0e298140b 100644
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -5,8 +5,15 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
+ARG LEVEL_ZERO_VERSION=1.28.2
+ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
RUN apt-get update && \
- apt-get install -y git libssl-dev
+ apt-get install -y git libssl-dev wget ca-certificates && \
+ cd /tmp && \
+ wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
+ wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
+ apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
+ rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb
WORKDIR /app
@@ -109,4 +116,3 @@ WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]
-
diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml
index 2a6642292..09635f64e 100644
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -50,6 +50,8 @@ jobs:
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
+ LEVEL_ZERO_VERSION: "1.28.2"
+ LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
continue-on-error: true
@@ -71,6 +73,14 @@ jobs:
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+ - name: Install Level Zero SDK
+ shell: bash
+ run: |
+ cd /tmp
+ wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+ wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+ sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
- name: Clone
id: checkout
uses: actions/checkout@v6
@@ -107,6 +117,7 @@ jobs:
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+ LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
@@ -127,6 +138,13 @@ jobs:
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+ - name: Install Level Zero SDK
+ shell: pwsh
+ run: |
+ Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+ Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+ "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 924f6cd3f..00e37c3e6 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -600,6 +600,7 @@ jobs:
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+ LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
@@ -621,6 +622,13 @@ jobs:
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+ - name: Install Level Zero SDK
+ shell: pwsh
+ run: |
+ Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+ Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+ "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
@@ -655,6 +663,13 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+ ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
+ if [ -n "$ZE_LOADER_DLL" ]; then
+ echo "Using Level Zero loader: $ZE_LOADER_DLL"
+ cp "$ZE_LOADER_DLL" ./build/bin
+ else
+ echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
+ fi
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
@@ -695,6 +710,8 @@ jobs:
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
+ LEVEL_ZERO_VERSION: "1.28.2"
+ LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
steps:
- name: Clone
@@ -718,6 +735,14 @@ jobs:
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+ - name: Install Level Zero SDK
+ shell: bash
+ run: |
+ cd /tmp
+ wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+ wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+ sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index f66facc85..155f933b8 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -720,6 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
+| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@@ -733,9 +734,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
+| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer |
-| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
+| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
## Compile-time Flags
@@ -819,7 +821,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
- You need to enable to support 4GB memory malloc by:
+ With the default `GGML_SYCL_ENABLE_LEVEL_ZERO=1`, llama.cpp requests Level Zero's relaxed maximum-size allocation limit directly. If Level Zero support is disabled at build time or runtime and the allocation goes through SYCL/Unified Runtime instead, enable support for allocations larger than 4 GiB by:
```
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 672b37dff..de765ef3e 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -249,6 +249,7 @@ option(GGML_SYCL "ggml: use SYCL"
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
+option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 8f44c6ed0..180de9220 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -39,6 +39,18 @@ if (WIN32)
set(CMAKE_CXX_COMPILER "icx")
set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
endif()
+ # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled)
+ if(GGML_SYCL_SUPPORT_LEVEL_ZERO)
+ if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH})
+ set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH})
+ if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}")
+ target_include_directories(ggml-sycl PRIVATE "${LEVEL_ZERO_V1_SDK_PATH}/include")
+ set(LEVEL_ZERO_V1_SDK_LIB_PATH "${LEVEL_ZERO_V1_SDK_PATH}/lib")
+ else()
+ message(WARNING "LEVEL_ZERO_V1_SDK_PATH set but folder not found: ${LEVEL_ZERO_V1_SDK_PATH}")
+ endif()
+ endif()
+ endif()
endif()
macro(detect_and_find_package package_name)
@@ -93,6 +105,23 @@ endif()
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
+message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
+if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
+ # Link against Level Zero loader for direct device memory allocation.
+ # Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging
+ # in the xe kernel driver during multi-GPU inference.
+ find_path(LEVEL_ZERO_INCLUDE_DIR level_zero/ze_api.h HINTS ${ONEAPI_ROOT}/include ${LEVEL_ZERO_V1_SDK_PATH}/include)
+ find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH)
+ if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
+ target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
+ target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
+ message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}")
+ message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}")
+ else()
+ message(WARNING "Level Zero loader or headers not found, Level Zero support disabled")
+ endif()
+endif()
+
# Link against oneDNN
set(GGML_SYCL_DNNL 0)
if(GGML_SYCL_DNN)
diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
index 05fd5ef46..ae08abad8 100644
--- a/ggml/src/ggml-sycl/common.cpp
+++ b/ggml/src/ggml-sycl/common.cpp
@@ -11,6 +11,10 @@
//
#include "common.hpp"
+#include
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#include
+#endif
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
@@ -55,6 +59,20 @@ bool gpu_has_xmx(sycl::device &dev) {
return dev.has(sycl::aspect::ext_intel_matrix);
}
+static int ggml_sycl_get_env(const char *env_name, int default_val) {
+ char *user_device_string = getenv(env_name);
+ int user_number = default_val;
+
+ unsigned n;
+ if (user_device_string != NULL &&
+ sscanf(user_device_string, " %u", &n) == 1) {
+ user_number = (int)n;
+ } else {
+ user_number = default_val;
+ }
+ return user_number;
+}
+
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
const int64_t max_range = std::numeric_limits::max();
int64_t sycl_down_blk_size = block_size;
@@ -66,6 +84,61 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
return sycl_down_blk_size;
}
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
+ return ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1) &&
+ q.get_device().is_gpu() &&
+ q.get_backend() == sycl::backend::ext_oneapi_level_zero;
+}
+#endif
+
+// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
+// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
+// The decision is made from the queue and runtime env because large buffers can be
+// allocated before ggml_check_sycl() initializes g_ggml_sycl_enable_level_zero.
+void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+ if (ggml_sycl_use_level_zero_device_alloc(q)) {
+ void *ptr = nullptr;
+ auto ze_ctx = sycl::get_native(q.get_context());
+ auto ze_dev = sycl::get_native(q.get_device());
+#ifdef ZE_RELAXED_ALLOCATION_LIMITS_EXP_NAME
+ ze_relaxed_allocation_limits_exp_desc_t relaxed_desc = {
+ ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC,
+ nullptr,
+ ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE,
+ };
+ ze_device_mem_alloc_desc_t alloc_desc = {
+ ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+ &relaxed_desc,
+ 0,
+ 0,
+ };
+#else
+ ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0};
+#endif
+ ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr);
+ if (r == ZE_RESULT_SUCCESS && ptr) {
+ return ptr;
+ }
+ return nullptr;
+ }
+#endif
+ return sycl::malloc_device(size, q);
+}
+
+void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
+ if (!ptr) return;
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+ if (ggml_sycl_use_level_zero_device_alloc(q)) {
+ auto ze_ctx = sycl::get_native(q.get_context());
+ zeMemFree(ze_ctx, ptr);
+ return;
+ }
+#endif
+ SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, q)));
+}
+
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams) {
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
@@ -75,8 +148,7 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector str
}
if (extra->data_device[i] != nullptr && streams.size()>0) {
ggml_sycl_set_device(i);
- SYCL_CHECK(
- CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(extra->data_device[i], *(streams[i]))));
}
}
delete extra;
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index eec36e8db..96bc1c98b 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -310,6 +310,10 @@ struct ggml_tensor_extra_gpu {
optimize_feature optimized_feature;
};
+extern int g_ggml_sycl_enable_level_zero;
+void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
+void ggml_sycl_free_device(void *ptr, sycl::queue &q);
+
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={});
namespace sycl_ex = sycl::ext::oneapi::experimental;
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 57cc4ffb6..f5d10b56d 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -30,6 +30,10 @@
#include
#include
+#include
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#include
+#endif
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
# include
#endif
@@ -68,6 +72,7 @@ int g_ggml_sycl_disable_graph = 0;
int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_use_async_mem_op = 0;
+int g_ggml_sycl_enable_level_zero = 0;
int g_ggml_sycl_enable_flash_attention = 1;
@@ -223,6 +228,27 @@ static void ggml_check_sycl() try {
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+ g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
+#else
+ g_ggml_sycl_enable_level_zero = 0;
+#endif
+ if (g_ggml_sycl_enable_level_zero) {
+ // Verify all GPU devices use the Level Zero backend before enabling L0 APIs.
+ // Only check GPU devices; CPU devices use OpenCL and would otherwise
+ // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
+ for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) {
+ auto & q = dpct::dev_mgr::instance().get_device(i).default_queue();
+ if (!q.get_device().is_gpu()) {
+ continue;
+ }
+ if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
+ GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
+ g_ggml_sycl_enable_level_zero = 0;
+ break;
+ }
+ }
+ }
#ifdef SYCL_FLASH_ATTN
g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
@@ -253,6 +279,11 @@ static void ggml_check_sycl() try {
#else
GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
#endif
+#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
+ GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
+#else
+ GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
+#endif
GGML_LOG_INFO("Running with Environment Variables:\n");
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
@@ -262,6 +293,11 @@ static void ggml_check_sycl() try {
#else
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
#endif
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
+#else
+ GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
+#endif
#if GGML_SYCL_DNNL
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
#else
@@ -371,7 +407,7 @@ struct ggml_backend_sycl_buffer_context {
~ggml_backend_sycl_buffer_context() {
if (dev_ptr != nullptr) {
ggml_sycl_set_device(device);
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
}
//release extra used by tensors
@@ -504,8 +540,43 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
+ if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
+ return false;
+ }
+
+ ze_device_handle_t ze_dev = sycl::get_native(q.get_device());
+ ze_device_properties_t props = {};
+ props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
+ return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
+}
+#endif
+
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+ // Use Level Zero direct copy for dGPU-to-dGPU transfers.
+ const bool l0_copy_supported =
+ ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
+ if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
+ auto ze_ctx = sycl::get_native(q_dst.get_context());
+ auto ze_dev = sycl::get_native(q_dst.get_device());
+ ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
+ 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+ ze_command_list_handle_t cl;
+ ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
+ if (r == ZE_RESULT_SUCCESS) {
+ r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
+ zeCommandListDestroy(cl);
+ if (r == ZE_RESULT_SUCCESS) {
+ return;
+ }
+ }
+ }
+#endif
+ // Host-staged copy
char *host_buf = (char *)malloc(size);
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
@@ -675,8 +746,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
void * dev_ptr;
- SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
- size, *stream)));
+ SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
@@ -917,18 +987,10 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
- // FIXME: do not crash if SYCL Buffer alloc fails
- // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
ggml_sycl_set_device(i);
const queue_ptr stream = ctx->streams[i];
char * buf;
- /*
- DPCT1009:208: SYCL uses exceptions to report errors and does not use the
- error codes. The original code was commented out and a warning string
- was inserted. You need to rewrite this code.
- */
- SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
- size, *stream)));
+ SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)ggml_sycl_malloc_device(size, *stream)));
if (!buf) {
char err_buf[1024];
snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
@@ -1306,7 +1368,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(b.ptr, *qptr)));
pool_size -= b.size;
}
}
@@ -1374,9 +1436,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
void * ptr;
size_t look_ahead_size = (size_t) (1.05 * size);
- SYCL_CHECK(
- CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
- look_ahead_size, *qptr)));
+ SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)ggml_sycl_malloc_device(look_ahead_size, *qptr)));
if (!ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
return nullptr;
@@ -1404,7 +1464,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
}
}
GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
- SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
+ SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(ptr, *qptr)));
pool_size -= size;
}
};
@@ -3405,7 +3465,7 @@ static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size)
// If async allocation extension is not available, use_async should always be false.
GGML_ASSERT(!use_async);
#endif
- return sycl::malloc(size, *stream, sycl::usm::alloc::device);
+ return ggml_sycl_malloc_device(size, *stream);
}
static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
@@ -3419,7 +3479,7 @@ static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
// If async allocation extension is not available, use_async should always be false.
GGML_ASSERT(!use_async);
#endif
- sycl::free(ptr, *stream);
+ ggml_sycl_free_device(ptr, *stream);
}
// RAII wrapper for temporary reorder buffers with optional host memory fallback.