mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge commit 'acd38efee3
' into concedo_experimental
# Conflicts: # .devops/cpu.Dockerfile # .devops/vulkan.Dockerfile # .github/workflows/build.yml # .github/workflows/docker.yml # CMakeLists.txt # README.md # cmake/llama-config.cmake.in # examples/simple-cmake-pkg/.gitignore # ggml/CMakeLists.txt # ggml/src/CMakeLists.txt # ggml/src/ggml-hip/CMakeLists.txt
This commit is contained in:
commit
c5d4e07664
11 changed files with 395 additions and 61 deletions
|
@ -73,6 +73,8 @@ add_compile_definitions(GGML_USE_CPU_AARCH64)
|
||||||
if (MSVC)
|
if (MSVC)
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
|
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
|
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
|
||||||
|
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
|
||||||
|
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
file(GLOB GGML_SOURCES_CUDA "ggml/src/ggml-cuda/*.cu")
|
file(GLOB GGML_SOURCES_CUDA "ggml/src/ggml-cuda/*.cu")
|
||||||
|
|
|
@ -1427,16 +1427,16 @@ struct server_queue {
|
||||||
int post(server_task task, bool front = false) {
|
int post(server_task task, bool front = false) {
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
GGML_ASSERT(task.id != -1);
|
GGML_ASSERT(task.id != -1);
|
||||||
|
// if this is cancel task make sure to clean up pending tasks
|
||||||
|
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
||||||
|
cleanup_pending_task(task.id_target);
|
||||||
|
}
|
||||||
QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
|
QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
|
||||||
if (front) {
|
if (front) {
|
||||||
queue_tasks.push_front(std::move(task));
|
queue_tasks.push_front(std::move(task));
|
||||||
} else {
|
} else {
|
||||||
queue_tasks.push_back(std::move(task));
|
queue_tasks.push_back(std::move(task));
|
||||||
}
|
}
|
||||||
// if this is cancel task make sure to clean up pending tasks
|
|
||||||
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
|
||||||
cleanup_pending_task(task.id_target);
|
|
||||||
}
|
|
||||||
condition_tasks.notify_one();
|
condition_tasks.notify_one();
|
||||||
return task.id;
|
return task.id;
|
||||||
}
|
}
|
||||||
|
@ -1448,16 +1448,16 @@ struct server_queue {
|
||||||
if (task.id == -1) {
|
if (task.id == -1) {
|
||||||
task.id = id++;
|
task.id = id++;
|
||||||
}
|
}
|
||||||
|
// if this is cancel task make sure to clean up pending tasks
|
||||||
|
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
||||||
|
cleanup_pending_task(task.id_target);
|
||||||
|
}
|
||||||
QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
|
QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
|
||||||
if (front) {
|
if (front) {
|
||||||
queue_tasks.push_front(std::move(task));
|
queue_tasks.push_front(std::move(task));
|
||||||
} else {
|
} else {
|
||||||
queue_tasks.push_back(std::move(task));
|
queue_tasks.push_back(std::move(task));
|
||||||
}
|
}
|
||||||
// if this is cancel task make sure to clean up pending tasks
|
|
||||||
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
|
||||||
cleanup_pending_task(task.id_target);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
condition_tasks.notify_one();
|
condition_tasks.notify_one();
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1554,10 +1554,10 @@ struct server_queue {
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void cleanup_pending_task(int id_task) {
|
void cleanup_pending_task(int id_target) {
|
||||||
// no need lock because this is called exclusively by post()
|
// no need lock because this is called exclusively by post()
|
||||||
auto rm_func = [id_task](const server_task & task) {
|
auto rm_func = [id_target](const server_task & task) {
|
||||||
return task.id_target == id_task;
|
return task.id_target == id_target;
|
||||||
};
|
};
|
||||||
queue_tasks.erase(
|
queue_tasks.erase(
|
||||||
std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
|
std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
|
||||||
|
|
11
examples/simple-cmake-pkg/CMakeLists.txt
Normal file
11
examples/simple-cmake-pkg/CMakeLists.txt
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
cmake_minimum_required(VERSION 3.12)
|
||||||
|
project(llama-simple-cmake-pkg)
|
||||||
|
|
||||||
|
set(TARGET llama-simple-cmake-pkg)
|
||||||
|
|
||||||
|
find_package(Llama REQUIRED)
|
||||||
|
|
||||||
|
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
34
examples/simple-cmake-pkg/README.md
Normal file
34
examples/simple-cmake-pkg/README.md
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
# llama.cpp/example/simple-cmake-pkg
|
||||||
|
|
||||||
|
This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
|
||||||
|
|
||||||
|
### Considerations
|
||||||
|
|
||||||
|
When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.), the appropriate dependencies will be searched for automatically. So, for example, when finding a package
|
||||||
|
|
||||||
|
### Build llama.cpp and install to llama.cpp/inst
|
||||||
|
|
||||||
|
```sh
|
||||||
|
git clone https://github.com/ggerganov/llama.cpp
|
||||||
|
cd llama.cpp
|
||||||
|
cmake -S . -B build
|
||||||
|
cmake --build build
|
||||||
|
cmake --install build --prefix inst
|
||||||
|
|
||||||
|
### Build simple-cmake-pkg
|
||||||
|
|
||||||
|
```sh
|
||||||
|
cd examples/simple-cmake-pkg
|
||||||
|
cmake -S . -B build -DCMAKE_PREFIX_PATH=../../inst/lib/cmake
|
||||||
|
cmake --build build
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run simple-cmake-pkg
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./build/llama-simple-cmake-pkg -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
|
||||||
|
```
|
147
ggml/cmake/ggml-config.cmake.in
Normal file
147
ggml/cmake/ggml-config.cmake.in
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
|
||||||
|
@GGML_VARIABLES_EXPANDED@
|
||||||
|
|
||||||
|
@PACKAGE_INIT@
|
||||||
|
|
||||||
|
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
||||||
|
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
||||||
|
set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||||
|
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
|
find_library(GGML_LIBRARY ggml
|
||||||
|
REQUIRED
|
||||||
|
HINTS ${GGML_LIB_DIR}
|
||||||
|
NO_CMAKE_FIND_ROOT_PATH)
|
||||||
|
|
||||||
|
add_library(ggml::ggml UNKNOWN IMPORTED)
|
||||||
|
set_target_properties(ggml::ggml
|
||||||
|
PROPERTIES
|
||||||
|
IMPORTED_LOCATION "${GGML_LIBRARY}")
|
||||||
|
|
||||||
|
find_library(GGML_BASE_LIBRARY ggml-base
|
||||||
|
REQUIRED
|
||||||
|
HINTS ${GGML_LIB_DIR}
|
||||||
|
NO_CMAKE_FIND_ROOT_PATH)
|
||||||
|
|
||||||
|
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||||
|
set_target_properties(ggml::ggml-base
|
||||||
|
PROPERTIES
|
||||||
|
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||||
|
|
||||||
|
if (NOT GGML_SHARED_LIB)
|
||||||
|
if (APPLE AND GGML_ACCELERATE)
|
||||||
|
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||||
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_OPENMP)
|
||||||
|
find_package(OpenMP REQUIRED)
|
||||||
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_CPU_HBM)
|
||||||
|
find_library(memkind memkind REQUIRED)
|
||||||
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_BLAS)
|
||||||
|
find_package(BLAS REQUIRED)
|
||||||
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
|
||||||
|
list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_CUDA)
|
||||||
|
find_package(CUDAToolkit REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_METAL)
|
||||||
|
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||||
|
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||||
|
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||||
|
|
||||||
|
list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
|
||||||
|
${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_VULKAN)
|
||||||
|
find_package(Vulkan REQUIRED)
|
||||||
|
list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_HIP)
|
||||||
|
find_package(hip REQUIRED)
|
||||||
|
find_package(hipblas REQUIRED)
|
||||||
|
find_package(rocblas REQUIRED)
|
||||||
|
list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_SYCL)
|
||||||
|
find_package(DNNL)
|
||||||
|
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||||
|
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
|
||||||
|
endif()
|
||||||
|
if (WIN32)
|
||||||
|
find_package(IntelSYCL REQUIRED)
|
||||||
|
find_package(MKL REQUIRED)
|
||||||
|
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(_ggml_all_targets "")
|
||||||
|
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||||
|
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||||
|
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||||
|
|
||||||
|
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
|
||||||
|
REQUIRED
|
||||||
|
HINTS ${GGML_LIB_DIR}
|
||||||
|
NO_CMAKE_FIND_ROOT_PATH)
|
||||||
|
|
||||||
|
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
|
||||||
|
|
||||||
|
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
|
||||||
|
set_target_properties(ggml::${_ggml_backend}
|
||||||
|
PROPERTIES
|
||||||
|
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
|
||||||
|
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||||
|
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
|
||||||
|
INTERFACE_COMPILE_FEATURES c_std_90
|
||||||
|
POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
|
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
|
||||||
|
if(is_cpu_variant)
|
||||||
|
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
|
||||||
|
set_target_properties(ggml::${_ggml_backend}
|
||||||
|
PROPERTIES
|
||||||
|
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
|
||||||
|
|
||||||
|
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
|
||||||
|
set_target_properties(ggml::${_ggml_backend}
|
||||||
|
PROPERTIES
|
||||||
|
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
else()
|
||||||
|
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
|
||||||
|
set_target_properties(ggml::${_ggml_backend}
|
||||||
|
PROPERTIES
|
||||||
|
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
|
||||||
|
|
||||||
|
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
|
||||||
|
set_target_properties(ggml::${_ggml_backend}
|
||||||
|
PROPERTIES
|
||||||
|
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||||
|
endforeach()
|
||||||
|
|
||||||
|
add_library(ggml::all INTERFACE IMPORTED)
|
||||||
|
set_target_properties(ggml::all
|
||||||
|
PROPERTIES
|
||||||
|
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
|
||||||
|
|
||||||
|
check_required_components(ggml)
|
|
@ -131,6 +131,10 @@ typedef float dfloat; // dequantize float
|
||||||
typedef float2 dfloat2;
|
typedef float2 dfloat2;
|
||||||
#endif // GGML_CUDA_F16
|
#endif // GGML_CUDA_F16
|
||||||
|
|
||||||
|
#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
|
||||||
|
#define GGML_USE_VMM
|
||||||
|
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
|
||||||
|
|
||||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||||
#define FP16_AVAILABLE
|
#define FP16_AVAILABLE
|
||||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||||
|
|
|
@ -155,7 +155,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
int device_vmm = 0;
|
int device_vmm = 0;
|
||||||
|
|
||||||
#if !defined(GGML_CUDA_NO_VMM)
|
#if defined(GGML_USE_VMM)
|
||||||
CUdevice device;
|
CUdevice device;
|
||||||
CU_CHECK(cuDeviceGet(&device, id));
|
CU_CHECK(cuDeviceGet(&device, id));
|
||||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||||
|
@ -167,7 +167,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
alloc_prop.location.id = id;
|
alloc_prop.location.id = id;
|
||||||
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||||
}
|
}
|
||||||
#endif // !defined(GGML_CUDA_NO_VMM)
|
#endif // defined(GGML_USE_VMM)
|
||||||
info.devices[id].vmm = !!device_vmm;
|
info.devices[id].vmm = !!device_vmm;
|
||||||
|
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
|
@ -301,7 +301,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||||
};
|
};
|
||||||
|
|
||||||
// pool with virtual memory
|
// pool with virtual memory
|
||||||
#if !defined(GGML_CUDA_NO_VMM)
|
#if defined(GGML_USE_VMM)
|
||||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||||
|
|
||||||
|
@ -409,14 +409,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
|
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
#endif // !defined(GGML_CUDA_NO_VMM)
|
#endif // defined(GGML_USE_VMM)
|
||||||
|
|
||||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||||
#if !defined(GGML_CUDA_NO_VMM)
|
#if defined(GGML_USE_VMM)
|
||||||
if (ggml_cuda_info().devices[device].vmm) {
|
if (ggml_cuda_info().devices[device].vmm) {
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||||
}
|
}
|
||||||
#endif // !defined(GGML_CUDA_NO_VMM)
|
#endif // defined(GGML_USE_VMM)
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3256,7 +3256,7 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
|
||||||
features.push_back({ "FORCE_CUBLAS", "1" });
|
features.push_back({ "FORCE_CUBLAS", "1" });
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_CUDA_NO_VMM
|
#ifndef GGML_USE_VMM
|
||||||
features.push_back({ "NO_VMM", "1" });
|
features.push_back({ "NO_VMM", "1" });
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,10 @@
|
||||||
// max number of MTLCommandBuffer used to submit a graph for processing
|
// max number of MTLCommandBuffer used to submit a graph for processing
|
||||||
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
|
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
|
||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
// create residency sets only on macOS >= 15.0
|
||||||
|
#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
|
||||||
|
#define GGML_METAL_HAS_RESIDENCY_SETS 1
|
||||||
|
#endif
|
||||||
|
|
||||||
// globals
|
// globals
|
||||||
|
|
||||||
|
@ -39,6 +42,7 @@ static struct ggml_backend_metal_device_context {
|
||||||
|
|
||||||
bool has_simdgroup_reduction;
|
bool has_simdgroup_reduction;
|
||||||
bool has_simdgroup_mm;
|
bool has_simdgroup_mm;
|
||||||
|
bool has_residency_sets;
|
||||||
bool has_bfloat;
|
bool has_bfloat;
|
||||||
bool use_bfloat;
|
bool use_bfloat;
|
||||||
|
|
||||||
|
@ -48,6 +52,7 @@ static struct ggml_backend_metal_device_context {
|
||||||
/*.mtl_device_ref_count =*/ 0,
|
/*.mtl_device_ref_count =*/ 0,
|
||||||
/*.has_simdgroup_reduction =*/ false,
|
/*.has_simdgroup_reduction =*/ false,
|
||||||
/*.has_simdgroup_mm =*/ false,
|
/*.has_simdgroup_mm =*/ false,
|
||||||
|
/*.has_residency_sets =*/ false,
|
||||||
/*.has_bfloat =*/ false,
|
/*.has_bfloat =*/ false,
|
||||||
/*.use_bfloat =*/ false,
|
/*.use_bfloat =*/ false,
|
||||||
/*.name =*/ "",
|
/*.name =*/ "",
|
||||||
|
@ -59,12 +64,18 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
|
||||||
|
|
||||||
if (ctx->mtl_device == nil) {
|
if (ctx->mtl_device == nil) {
|
||||||
ctx->mtl_device = MTLCreateSystemDefaultDevice();
|
ctx->mtl_device = MTLCreateSystemDefaultDevice();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx->mtl_device) {
|
||||||
ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||||
ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
||||||
|
|
||||||
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||||
|
|
||||||
|
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
||||||
|
ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
||||||
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
|
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
|
||||||
|
|
||||||
|
@ -90,8 +101,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||||
ctx->mtl_device_ref_count--;
|
ctx->mtl_device_ref_count--;
|
||||||
|
|
||||||
if (ctx->mtl_device_ref_count == 0) {
|
if (ctx->mtl_device_ref_count == 0) {
|
||||||
[ctx->mtl_device release];
|
if (ctx->mtl_device) {
|
||||||
ctx->mtl_device = nil;
|
[ctx->mtl_device release];
|
||||||
|
ctx->mtl_device = nil;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -483,6 +496,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||||
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
|
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
|
||||||
|
|
||||||
ctx->queue = [device newCommandQueue];
|
ctx->queue = [device newCommandQueue];
|
||||||
|
if (ctx->queue == nil) {
|
||||||
|
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||||
|
|
||||||
id<MTLLibrary> metal_library;
|
id<MTLLibrary> metal_library;
|
||||||
|
@ -649,6 +667,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||||
|
|
||||||
GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
|
GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
|
||||||
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
|
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
|
||||||
|
GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
|
||||||
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
|
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
|
||||||
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
|
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
|
||||||
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
|
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
|
||||||
|
@ -1035,8 +1054,70 @@ struct ggml_backend_metal_buffer_context {
|
||||||
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
|
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
|
||||||
int n_buffers;
|
int n_buffers;
|
||||||
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
||||||
|
|
||||||
|
// optional MTLResidencySet
|
||||||
|
id rset;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// rset init
|
||||||
|
static bool ggml_backend_metal_buffer_rset_init(
|
||||||
|
struct ggml_backend_metal_buffer_context * ctx,
|
||||||
|
struct ggml_backend_metal_device_context * ctx_dev,
|
||||||
|
id<MTLDevice> device) {
|
||||||
|
ctx->rset = nil;
|
||||||
|
|
||||||
|
if (!ctx_dev->has_residency_sets) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
||||||
|
if (@available(macOS 15.0, *)) {
|
||||||
|
MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
|
||||||
|
desc.label = @"ggml_backend_metal";
|
||||||
|
desc.initialCapacity = ctx->n_buffers;
|
||||||
|
|
||||||
|
NSError * error;
|
||||||
|
ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
|
||||||
|
if (error) {
|
||||||
|
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||||
|
[desc release];
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
[desc release];
|
||||||
|
|
||||||
|
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
[ctx->rset addAllocation:ctx->buffers[i].metal];
|
||||||
|
}
|
||||||
|
|
||||||
|
[ctx->rset commit];
|
||||||
|
[ctx->rset requestResidency];
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
GGML_UNUSED(ctx_dev);
|
||||||
|
GGML_UNUSED(device);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// rset free
|
||||||
|
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
|
||||||
|
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
||||||
|
if (@available(macOS 15.0, *)) {
|
||||||
|
if (ctx->rset) {
|
||||||
|
[ctx->rset endResidency];
|
||||||
|
[ctx->rset removeAllAllocations];
|
||||||
|
[ctx->rset release];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
GGML_UNUSED(ctx);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||||
// Metal buffer based on the host memory pointer
|
// Metal buffer based on the host memory pointer
|
||||||
|
@ -4176,6 +4257,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||||
for (int i = 0; i < ctx->n_buffers; i++) {
|
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||||
[ctx->buffers[i].metal release];
|
[ctx->buffers[i].metal release];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_backend_metal_buffer_rset_free(ctx);
|
||||||
ggml_backend_metal_device_rel(buffer->buft->device->context);
|
ggml_backend_metal_device_rel(buffer->buft->device->context);
|
||||||
|
|
||||||
if (ctx->owned) {
|
if (ctx->owned) {
|
||||||
|
@ -4198,19 +4281,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||||
memset((char *)tensor->data + offset, value, size);
|
memset((char *)tensor->data + offset, value, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||||
|
@ -4220,7 +4303,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
@ -4246,7 +4329,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
||||||
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
|
|
||||||
UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
|
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
|
||||||
|
@ -4270,8 +4353,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
UNUSED(device);
|
GGML_UNUSED(device);
|
||||||
UNUSED(size_aligned);
|
GGML_UNUSED(size_aligned);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
|
@ -4284,7 +4367,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
||||||
size_aligned += (size_page - (size_aligned % size_page));
|
size_aligned += (size_page - (size_aligned % size_page));
|
||||||
}
|
}
|
||||||
|
|
||||||
id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
|
struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
|
||||||
|
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
|
||||||
|
|
||||||
ctx->all_data = ggml_metal_host_malloc(size_aligned);
|
ctx->all_data = ggml_metal_host_malloc(size_aligned);
|
||||||
ctx->all_size = size_aligned;
|
ctx->all_size = size_aligned;
|
||||||
|
@ -4307,7 +4391,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
||||||
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
|
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
|
||||||
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
||||||
free(ctx);
|
free(ctx);
|
||||||
ggml_backend_metal_device_rel(buft->device->context);
|
ggml_backend_metal_device_rel(ctx_dev);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
|
||||||
|
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
|
||||||
|
free(ctx);
|
||||||
|
ggml_backend_metal_device_rel(ctx_dev);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4318,7 +4409,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
||||||
|
|
||||||
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
return 32;
|
return 32;
|
||||||
UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||||
|
@ -4328,13 +4419,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
|
||||||
|
|
||||||
return max_size;
|
return max_size;
|
||||||
|
|
||||||
UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
|
@ -4357,7 +4448,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
|
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||||
return "Metal_Mapped";
|
return "Metal_Mapped";
|
||||||
|
|
||||||
UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
|
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
|
||||||
|
@ -4400,7 +4491,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
||||||
size_aligned += (size_page - (size_aligned % size_page));
|
size_aligned += (size_page - (size_aligned % size_page));
|
||||||
}
|
}
|
||||||
|
|
||||||
id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
|
struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
|
||||||
|
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
|
||||||
|
|
||||||
// the buffer fits into the max buffer size allowed by the device
|
// the buffer fits into the max buffer size allowed by the device
|
||||||
if (size_aligned <= device.maxBufferLength) {
|
if (size_aligned <= device.maxBufferLength) {
|
||||||
|
@ -4453,6 +4545,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
|
||||||
|
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
|
||||||
|
free(ctx);
|
||||||
|
ggml_backend_metal_device_rel(ctx_dev);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
|
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4461,7 +4560,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
||||||
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
|
|
||||||
UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_free(ggml_backend_t backend) {
|
static void ggml_backend_metal_free(ggml_backend_t backend) {
|
||||||
|
@ -4766,6 +4865,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
|
||||||
|
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
|
||||||
|
free(ctx);
|
||||||
|
ggml_backend_metal_device_rel(ctx_dev);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
|
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4779,7 +4885,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
|
||||||
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
|
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
|
||||||
buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
|
buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
|
||||||
|
|
||||||
UNUSED(dev);
|
GGML_UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||||
|
|
|
@ -89,6 +89,10 @@ struct vk_pipeline_struct {
|
||||||
uint32_t parameter_count;
|
uint32_t parameter_count;
|
||||||
std::array<uint32_t, 3> wg_denoms;
|
std::array<uint32_t, 3> wg_denoms;
|
||||||
uint32_t align;
|
uint32_t align;
|
||||||
|
// set to true to request the pipeline is compiled after the dryrun
|
||||||
|
bool needed {};
|
||||||
|
// set to true when the shader has been compiled
|
||||||
|
bool compiled {};
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
|
typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
|
||||||
|
@ -190,8 +194,11 @@ struct vk_device_struct {
|
||||||
bool mul_mat_id_m;
|
bool mul_mat_id_m;
|
||||||
bool mul_mat_id_s;
|
bool mul_mat_id_s;
|
||||||
|
|
||||||
vk_matmul_pipeline pipeline_matmul_f32;
|
// set to true to indicate that some shaders need to be compiled after the dryrun
|
||||||
vk_matmul_pipeline pipeline_matmul_f32_f16;
|
bool need_compiles {};
|
||||||
|
|
||||||
|
vk_matmul_pipeline pipeline_matmul_f32 {};
|
||||||
|
vk_matmul_pipeline pipeline_matmul_f32_f16 {};
|
||||||
vk_matmul_pipeline2 pipeline_matmul_f16;
|
vk_matmul_pipeline2 pipeline_matmul_f16;
|
||||||
vk_matmul_pipeline2 pipeline_matmul_f16_f32;
|
vk_matmul_pipeline2 pipeline_matmul_f16_f32;
|
||||||
vk_pipeline pipeline_matmul_split_k_reduce;
|
vk_pipeline pipeline_matmul_split_k_reduce;
|
||||||
|
@ -199,7 +206,7 @@ struct vk_device_struct {
|
||||||
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
|
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
|
||||||
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
|
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
|
||||||
|
|
||||||
vk_matmul_pipeline pipeline_matmul_id_f32;
|
vk_matmul_pipeline pipeline_matmul_id_f32 {};
|
||||||
vk_matmul_pipeline2 pipeline_matmul_id_f16;
|
vk_matmul_pipeline2 pipeline_matmul_id_f16;
|
||||||
vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;
|
vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;
|
||||||
|
|
||||||
|
@ -780,13 +787,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||||
GGML_ASSERT(parameter_count > 0);
|
GGML_ASSERT(parameter_count > 0);
|
||||||
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
||||||
|
|
||||||
pipeline = std::make_shared<vk_pipeline_struct>();
|
|
||||||
pipeline->name = name;
|
|
||||||
pipeline->parameter_count = parameter_count;
|
|
||||||
pipeline->push_constant_size = push_constant_size;
|
|
||||||
pipeline->wg_denoms = wg_denoms;
|
|
||||||
pipeline->align = align;
|
|
||||||
|
|
||||||
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
|
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
|
||||||
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
||||||
|
|
||||||
|
@ -869,6 +869,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||||
}
|
}
|
||||||
|
|
||||||
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
|
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
|
||||||
|
pipeline->compiled = true;
|
||||||
|
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> guard(device->mutex);
|
std::lock_guard<std::mutex> guard(device->mutex);
|
||||||
|
@ -879,12 +880,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||||
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
||||||
assert(compile_count > 0);
|
assert(compile_count > 0);
|
||||||
compile_count--;
|
compile_count--;
|
||||||
|
|
||||||
// "Progress bar" for shader compiles
|
|
||||||
static uint32_t total_compile_count = 0;
|
|
||||||
if ((total_compile_count++ % 10) == 0) {
|
|
||||||
std::cerr << ".";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
compile_count_cond.notify_all();
|
compile_count_cond.notify_all();
|
||||||
}
|
}
|
||||||
|
@ -910,6 +905,10 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
|
||||||
static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
|
static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
|
||||||
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
||||||
device->pipeline_descriptor_set_requirements[pipeline->name] += n;
|
device->pipeline_descriptor_set_requirements[pipeline->name] += n;
|
||||||
|
if (!pipeline->compiled) {
|
||||||
|
pipeline->needed = true;
|
||||||
|
device->need_compiles = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
|
static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
|
||||||
|
@ -1392,8 +1391,6 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
|
||||||
static void ggml_vk_load_shaders(vk_device& device) {
|
static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
||||||
|
|
||||||
std::cerr << "ggml_vulkan: Compiling shaders";
|
|
||||||
|
|
||||||
// some shaders have a minimum subgroup size
|
// some shaders have a minimum subgroup size
|
||||||
const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
|
const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
|
||||||
const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
|
const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
|
||||||
|
@ -1531,15 +1528,33 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
if (!device->pipeline_matmul_f32) {
|
||||||
device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||||
|
}
|
||||||
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
if (!device->pipeline_matmul_f32_f16) {
|
||||||
|
device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||||
|
}
|
||||||
|
if (!device->pipeline_matmul_id_f32) {
|
||||||
|
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::future<void>> compiles;
|
std::vector<std::future<void>> compiles;
|
||||||
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
|
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
|
||||||
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
|
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
|
||||||
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
|
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
|
||||||
|
|
||||||
|
if (!pipeline) {
|
||||||
|
pipeline = std::make_shared<vk_pipeline_struct>();
|
||||||
|
pipeline->name = name;
|
||||||
|
pipeline->parameter_count = parameter_count;
|
||||||
|
pipeline->push_constant_size = push_constant_size;
|
||||||
|
pipeline->wg_denoms = wg_denoms;
|
||||||
|
pipeline->align = align;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!pipeline->needed || pipeline->compiled) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
{
|
{
|
||||||
// wait until fewer than N compiles are in progress
|
// wait until fewer than N compiles are in progress
|
||||||
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
||||||
|
@ -2054,7 +2069,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
for (auto &c : compiles) {
|
for (auto &c : compiles) {
|
||||||
c.wait();
|
c.wait();
|
||||||
}
|
}
|
||||||
std::cerr << "Done!" << std::endl;
|
device->need_compiles = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
|
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
|
||||||
|
@ -7664,6 +7679,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
|
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
|
||||||
}
|
}
|
||||||
|
if (ctx->device->need_compiles) {
|
||||||
|
ggml_vk_load_shaders(ctx->device);
|
||||||
|
}
|
||||||
ggml_vk_preallocate_buffers(ctx);
|
ggml_vk_preallocate_buffers(ctx);
|
||||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||||
|
|
||||||
|
|
|
@ -1314,10 +1314,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
|
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
|
||||||
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
||||||
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
|
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
|
||||||
|
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
|
||||||
return {cpu_dev, &pimpl->cpu_buft_list};
|
return {cpu_dev, &pimpl->cpu_buft_list};
|
||||||
}
|
}
|
||||||
const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
|
const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
|
||||||
auto * dev = devices.at(layer_gpu);
|
auto * dev = devices.at(layer_gpu);
|
||||||
|
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
|
||||||
return {dev, &pimpl->gpu_buft_list.at(dev)};
|
return {dev, &pimpl->gpu_buft_list.at(dev)};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -9446,6 +9446,7 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||||
model->devices.push_back(*dev);
|
model->devices.push_back(*dev);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
std::vector<ggml_backend_dev_t> rpc_servers;
|
||||||
// use all available devices
|
// use all available devices
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||||
|
@ -9456,10 +9457,19 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||||
model->devices.push_back(dev);
|
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||||
|
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
||||||
|
rpc_servers.push_back(dev);
|
||||||
|
} else {
|
||||||
|
model->devices.push_back(dev);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// add RPC servers at the front of the list
|
||||||
|
if (!rpc_servers.empty()) {
|
||||||
|
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if using single GPU mode, remove all except the main GPU
|
// if using single GPU mode, remove all except the main GPU
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue