Merge commit 'acd38efee3' into concedo_experimental

# Conflicts:
#	.devops/cpu.Dockerfile
#	.devops/vulkan.Dockerfile
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	CMakeLists.txt
#	README.md
#	cmake/llama-config.cmake.in
#	examples/simple-cmake-pkg/.gitignore
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-hip/CMakeLists.txt
commit c5d4e07664
Author: Concedo
Date:   2025-01-28 18:16:44 +08:00

11 changed files with 395 additions and 61 deletions

View file

@@ -73,6 +73,8 @@ add_compile_definitions(GGML_USE_CPU_AARCH64)
 if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()
 
 file(GLOB GGML_SOURCES_CUDA "ggml/src/ggml-cuda/*.cu")

View file

@@ -1427,16 +1427,16 @@ struct server_queue {
     int post(server_task task, bool front = false) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         GGML_ASSERT(task.id != -1);
+        // if this is cancel task make sure to clean up pending tasks
+        if (task.type == SERVER_TASK_TYPE_CANCEL) {
+            cleanup_pending_task(task.id_target);
+        }
         QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
         if (front) {
             queue_tasks.push_front(std::move(task));
         } else {
             queue_tasks.push_back(std::move(task));
         }
-        // if this is cancel task make sure to clean up pending tasks
-        if (task.type == SERVER_TASK_TYPE_CANCEL) {
-            cleanup_pending_task(task.id_target);
-        }
         condition_tasks.notify_one();
         return task.id;
     }
@@ -1448,16 +1448,16 @@ struct server_queue {
             if (task.id == -1) {
                 task.id = id++;
             }
+            // if this is cancel task make sure to clean up pending tasks
+            if (task.type == SERVER_TASK_TYPE_CANCEL) {
+                cleanup_pending_task(task.id_target);
+            }
            QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
            if (front) {
                queue_tasks.push_front(std::move(task));
            } else {
                queue_tasks.push_back(std::move(task));
            }
-            // if this is cancel task make sure to clean up pending tasks
-            if (task.type == SERVER_TASK_TYPE_CANCEL) {
-                cleanup_pending_task(task.id_target);
-            }
        }
        condition_tasks.notify_one();
        return 0;
@@ -1554,10 +1554,10 @@ struct server_queue {
     }
 
 private:
-    void cleanup_pending_task(int id_task) {
+    void cleanup_pending_task(int id_target) {
         // no need lock because this is called exclusively by post()
-        auto rm_func = [id_task](const server_task & task) {
-            return task.id_target == id_task;
+        auto rm_func = [id_target](const server_task & task) {
+            return task.id_target == id_target;
         };
         queue_tasks.erase(
             std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
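
The relocated cleanup block is the functional change here: cleanup_pending_task() erases every queued task whose id_target matches, and a cancel task's own id_target matches by definition, so running the cleanup after the push would erase the freshly queued cancel task. A minimal sketch of the ordering, with simplified stand-in types rather than the server's actual ones:

```cpp
#include <algorithm>
#include <cassert>
#include <deque>

enum task_type { TASK_NORMAL, TASK_CANCEL };

struct task {
    int       id;
    int       id_target; // -1 when the task targets nothing
    task_type type;
};

static std::deque<task> queue_tasks;

// mirrors cleanup_pending_task: drop queued tasks aimed at id_target
static void cleanup_pending(int id_target) {
    auto rm_func = [id_target](const task & t) { return t.id_target == id_target; };
    queue_tasks.erase(std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
                      queue_tasks.end());
}

static int post(task t) {
    // must happen before the push: the cancel task's own id_target matches
    // rm_func, so cleaning up afterwards would erase the cancel task itself
    if (t.type == TASK_CANCEL) {
        cleanup_pending(t.id_target);
    }
    queue_tasks.push_back(std::move(t));
    return queue_tasks.back().id;
}

int main() {
    post({/*id=*/1, /*id_target=*/-1, TASK_NORMAL});
    post({/*id=*/2, /*id_target=*/ 1, TASK_CANCEL});
    assert(queue_tasks.size() == 2); // the cancel task survived its own cleanup
}
```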

View file

@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.12)
+project(llama-simple-cmake-pkg)
+
+set(TARGET llama-simple-cmake-pkg)
+
+find_package(Llama REQUIRED)
+
+add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

View file

@@ -0,0 +1,34 @@
+# llama.cpp/example/simple-cmake-pkg
+
+This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
+
+## Building
+
+Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
+
+### Considerations
+
+When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.), the appropriate dependencies will be searched for automatically. So, for example, when finding a package
+
+### Build llama.cpp and install to llama.cpp/inst
+
+```sh
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+cmake -S . -B build
+cmake --build build
+cmake --install build --prefix inst
+```
+
+### Build simple-cmake-pkg
+
+```sh
+cd examples/simple-cmake-pkg
+cmake -S . -B build -DCMAKE_PREFIX_PATH=../../inst/lib/cmake
+cmake --build build
+```
+
+### Run simple-cmake-pkg
+
+```sh
+./build/llama-simple-cmake-pkg -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
+```

View file

@@ -0,0 +1,147 @@
+@GGML_VARIABLES_EXPANDED@
+
+@PACKAGE_INIT@
+
+set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
+set_and_check(GGML_LIB_DIR     "@PACKAGE_GGML_LIB_INSTALL_DIR@")
+set_and_check(GGML_BIN_DIR     "@PACKAGE_GGML_BIN_INSTALL_DIR@")
+
+find_package(Threads REQUIRED)
+
+find_library(GGML_LIBRARY ggml
+    REQUIRED
+    HINTS ${GGML_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH)
+
+add_library(ggml::ggml UNKNOWN IMPORTED)
+set_target_properties(ggml::ggml
+    PROPERTIES
+        IMPORTED_LOCATION "${GGML_LIBRARY}")
+
+find_library(GGML_BASE_LIBRARY ggml-base
+    REQUIRED
+    HINTS ${GGML_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH)
+
+add_library(ggml::ggml-base UNKNOWN IMPORTED)
+set_target_properties(ggml::ggml-base
+    PROPERTIES
+        IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
+
+if (NOT GGML_SHARED_LIB)
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
+    endif()
+
+    if (GGML_OPENMP)
+        find_package(OpenMP REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    endif()
+
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
+    endif()
+
+    if (GGML_BLAS)
+        find_package(BLAS REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
+        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS   ${BLAS_LINKER_FLAGS})
+    endif()
+
+    if (GGML_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+    endif()
+
+    if (GGML_METAL)
+        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+        find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+        find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+
+        list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
+            ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
+    endif()
+
+    if (GGML_VULKAN)
+        find_package(Vulkan REQUIRED)
+        list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
+    endif()
+
+    if (GGML_HIP)
+        find_package(hip     REQUIRED)
+        find_package(hipblas REQUIRED)
+        find_package(rocblas REQUIRED)
+        list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
+    endif()
+
+    if (GGML_SYCL)
+        find_package(DNNL)
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
+        endif()
+        if (WIN32)
+            find_package(IntelSYCL REQUIRED)
+            find_package(MKL       REQUIRED)
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        endif()
+    endif()
+endif()
+
+set(_ggml_all_targets "")
+foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
+    string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
+    string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
+
+    find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
+        REQUIRED
+        HINTS ${GGML_LIB_DIR}
+        NO_CMAKE_FIND_ROOT_PATH)
+
+    message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
+
+    add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
+    set_target_properties(ggml::${_ggml_backend}
+        PROPERTIES
+            INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
+            IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+            IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
+            INTERFACE_COMPILE_FEATURES c_std_90
+            POSITION_INDEPENDENT_CODE ON)
+
+    string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
+    if(is_cpu_variant)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
+        set_target_properties(ggml::${_ggml_backend}
+            PROPERTIES
+                INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
+
+        if(GGML_CPU_INTERFACE_LINK_OPTIONS)
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
+        endif()
+    else()
+        list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
+        set_target_properties(ggml::${_ggml_backend}
+            PROPERTIES
+                INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
+
+        if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
+        endif()
+    endif()
+
+    list(APPEND _ggml_all_targets ggml::${_ggml_backend})
+endforeach()
+
+add_library(ggml::all INTERFACE IMPORTED)
+set_target_properties(ggml::all
+    PROPERTIES
+        INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
+
+check_required_components(ggml)
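
The foreach loop above creates one ggml::<backend> imported target for every entry in GGML_AVAILABLE_BACKENDS and aggregates them into ggml::all. The same backend set is visible at runtime through the ggml device registry; a small sketch, assuming only the registry calls that appear elsewhere in this merge:

```cpp
#include <cstdio>

#include "ggml-backend.h"

// walk the ggml device registry and print each device with its parent backend;
// uses only registry functions that appear in the diffs in this merge
int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        std::printf("device %zu: %s (backend: %s)\n",
                    i, ggml_backend_dev_name(dev), ggml_backend_reg_name(reg));
    }
    return 0;
}
```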

View file

@@ -131,6 +131,8 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16
 
+#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
+#define GGML_USE_VMM
+#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
+
 #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
 #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

View file

@@ -155,7 +155,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -167,7 +167,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)
         info.devices[id].vmm = !!device_vmm;
 
         cudaDeviceProp prop;
@@ -301,7 +301,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@@ -409,14 +409,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
     }
 };
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
@@ -3256,7 +3256,7 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
     features.push_back({ "FORCE_CUBLAS", "1" });
 #endif
 
-#ifdef GGML_CUDA_NO_VMM
+#ifndef GGML_USE_VMM
     features.push_back({ "NO_VMM", "1" });
 #endif

View file

@@ -19,7 +19,10 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
 
-#define UNUSED(x) (void)(x)
+// create residency sets only on macOS >= 15.0
+#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
+#define GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
 
 // globals
@@ -39,6 +42,7 @@ static struct ggml_backend_metal_device_context {
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
+    bool has_residency_sets;
     bool has_bfloat;
     bool use_bfloat;
@@ -48,6 +52,7 @@ static struct ggml_backend_metal_device_context {
     /*.mtl_device_ref_count =*/ 0,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm =*/ false,
+    /*.has_residency_sets =*/ false,
     /*.has_bfloat =*/ false,
     /*.use_bfloat =*/ false,
     /*.name =*/ "",
@@ -59,12 +64,18 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
     if (ctx->mtl_device == nil) {
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
+    }
 
+    if (ctx->mtl_device) {
         ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
         ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
         ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+        ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
+#endif
+
         ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
@@ -90,8 +101,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     ctx->mtl_device_ref_count--;
 
     if (ctx->mtl_device_ref_count == 0) {
-        [ctx->mtl_device release];
-        ctx->mtl_device = nil;
+        if (ctx->mtl_device) {
+            [ctx->mtl_device release];
+            ctx->mtl_device = nil;
+        }
     }
 }
@@ -483,6 +496,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->queue = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -649,6 +667,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
     GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
+    GGML_LOG_INFO("%s: has residency sets    = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
     GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
     GGML_LOG_INFO("%s: use bfloat            = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
     GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
@@ -1035,8 +1054,70 @@ struct ggml_backend_metal_buffer_context {
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
     struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    // optional MTLResidencySet
+    id rset;
 };
 
+// rset init
+static bool ggml_backend_metal_buffer_rset_init(
+        struct ggml_backend_metal_buffer_context * ctx,
+        struct ggml_backend_metal_device_context * ctx_dev,
+        id<MTLDevice> device) {
+    ctx->rset = nil;
+
+    if (!ctx_dev->has_residency_sets) {
+        return true;
+    }
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"ggml_backend_metal";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error;
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
+        if (error) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            [desc release];
+            return false;
+        }
+
+        [desc release];
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->rset addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->rset commit];
+        [ctx->rset requestResidency];
+
+        return true;
+    }
+#else
+    GGML_UNUSED(ctx_dev);
+    GGML_UNUSED(device);
+#endif
+
+    return true;
+}
+
+// rset free
+static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        if (ctx->rset) {
+            [ctx->rset endResidency];
+            [ctx->rset removeAllAllocations];
+            [ctx->rset release];
+        }
+    }
+#else
+    GGML_UNUSED(ctx);
+#endif
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -4176,6 +4257,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     for (int i = 0; i < ctx->n_buffers; i++) {
         [ctx->buffers[i].metal release];
     }
+
+    ggml_backend_metal_buffer_rset_free(ctx);
     ggml_backend_metal_device_rel(buffer->buft->device->context);
 
     if (ctx->owned) {
@@ -4198,19 +4281,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     memset((char *)tensor->data + offset, value, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4220,7 +4303,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
     }
     return false;
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4246,7 +4329,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal";
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -4270,8 +4353,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
     }
 #endif
 #endif
 
-    UNUSED(device);
-    UNUSED(size_aligned);
+    GGML_UNUSED(device);
+    GGML_UNUSED(size_aligned);
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -4284,7 +4367,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -4307,7 +4391,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_device_rel(buft->device->context);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
@@ -4318,7 +4409,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -4328,13 +4419,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
     return max_size;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -4357,7 +4448,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal_Mapped";
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
@@ -4400,7 +4491,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4545,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         }
     }
 
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -4461,7 +4560,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
-    UNUSED(backend);
+    GGML_UNUSED(backend);
 }
 
 static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -4766,6 +4865,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
         }
     }
 
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -4779,7 +4885,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
     return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
            buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
 
-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
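
The residency-set support above follows one pattern throughout: compile the feature only when the SDK is new enough (GGML_METAL_HAS_RESIDENCY_SETS), allow a runtime opt-out via the GGML_METAL_NO_RESIDENCY environment variable, and treat "feature unavailable" as success so only real allocation failures propagate. A C++ rendering of that pattern (the original is Objective-C; rset_init here is a hypothetical simplification of ggml_backend_metal_buffer_rset_init):

```cpp
#include <cstdlib>

#if defined(GGML_METAL_HAS_RESIDENCY_SETS) // requires __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
static bool residency_sets_enabled() {
    // enabled by default; users can opt out without rebuilding
    return std::getenv("GGML_METAL_NO_RESIDENCY") == nullptr;
}
#else
static bool residency_sets_enabled() {
    return false; // SDK too old: the feature is compiled out entirely
}
#endif

// returns true both on "nothing to do" and on success, so callers only treat
// a genuine residency-set allocation failure as an error
static bool rset_init() {
    if (!residency_sets_enabled()) {
        return true;
    }
    // ... create the set, add each buffer allocation, commit, request residency ...
    return true;
}

int main() { return rset_init() ? 0 : 1; }
```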

View file

@@ -89,6 +89,10 @@ struct vk_pipeline_struct {
     uint32_t parameter_count;
     std::array<uint32_t, 3> wg_denoms;
     uint32_t align;
+    // set to true to request the pipeline is compiled after the dryrun
+    bool needed {};
+    // set to true when the shader has been compiled
+    bool compiled {};
 };
 
 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -190,8 +194,11 @@ struct vk_device_struct {
     bool mul_mat_id_m;
     bool mul_mat_id_s;
 
-    vk_matmul_pipeline pipeline_matmul_f32;
-    vk_matmul_pipeline pipeline_matmul_f32_f16;
+    // set to true to indicate that some shaders need to be compiled after the dryrun
+    bool need_compiles {};
+
+    vk_matmul_pipeline pipeline_matmul_f32 {};
+    vk_matmul_pipeline pipeline_matmul_f32_f16 {};
     vk_matmul_pipeline2 pipeline_matmul_f16;
     vk_matmul_pipeline2 pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
@@ -199,7 +206,7 @@ struct vk_device_struct {
     vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
     vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
 
-    vk_matmul_pipeline pipeline_matmul_id_f32;
+    vk_matmul_pipeline pipeline_matmul_id_f32 {};
     vk_matmul_pipeline2 pipeline_matmul_id_f16;
     vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;
@@ -780,13 +787,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
-    pipeline = std::make_shared<vk_pipeline_struct>();
-    pipeline->name = name;
-    pipeline->parameter_count = parameter_count;
-    pipeline->push_constant_size = push_constant_size;
-    pipeline->wg_denoms = wg_denoms;
-    pipeline->align = align;
-
     vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
     pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
@@ -869,6 +869,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin
     }
 
     pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+    pipeline->compiled = true;
 
     {
         std::lock_guard<std::mutex> guard(device->mutex);
@@ -879,12 +880,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin
         std::lock_guard<std::mutex> guard(compile_count_mutex);
         assert(compile_count > 0);
         compile_count--;
-
-        // "Progress bar" for shader compiles
-        static uint32_t total_compile_count = 0;
-        if ((total_compile_count++ % 10) == 0) {
-            std::cerr << ".";
-        }
     }
     compile_count_cond.notify_all();
 }
@@ -910,6 +905,10 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
 static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
     VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
     device->pipeline_descriptor_set_requirements[pipeline->name] += n;
+    if (!pipeline->compiled) {
+        pipeline->needed = true;
+        device->need_compiles = true;
+    }
 }
 
 static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
@@ -1392,8 +1391,6 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
-    std::cerr << "ggml_vulkan: Compiling shaders";
-
     // some shaders have a minimum subgroup size
     const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
     const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
@@ -1531,15 +1528,33 @@ static void ggml_vk_load_shaders(vk_device& device) {
         }
     }
 
-    device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
-
-    device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    if (!device->pipeline_matmul_f32) {
+        device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    if (!device->pipeline_matmul_f32_f16) {
+        device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    if (!device->pipeline_matmul_id_f32) {
+        device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
 
     std::vector<std::future<void>> compiles;
     auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
                                               uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                               uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
+        if (!pipeline) {
+            pipeline = std::make_shared<vk_pipeline_struct>();
+            pipeline->name = name;
+            pipeline->parameter_count = parameter_count;
+            pipeline->push_constant_size = push_constant_size;
+            pipeline->wg_denoms = wg_denoms;
+            pipeline->align = align;
+        }
+
+        if (!pipeline->needed || pipeline->compiled) {
+            return;
+        }
         {
             // wait until fewer than N compiles are in progress
             uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -2054,7 +2069,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     for (auto &c : compiles) {
         c.wait();
     }
-    std::cerr << "Done!" << std::endl;
+    device->need_compiles = false;
 }
 
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
@@ -7664,6 +7679,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
+    if (ctx->device->need_compiles) {
+        ggml_vk_load_shaders(ctx->device);
+    }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);
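
Taken together, the Vulkan changes replace eager shader compilation with a dryrun-driven scheme: each pipeline carries needed/compiled flags, the dryrun marks what the current graph actually requires, and ggml_vk_load_shaders() is re-run only when need_compiles is set, compiling just the needed, not-yet-compiled pipelines. A simplified model of the bookkeeping, using stand-in types rather than the real vk_* structs:

```cpp
#include <memory>
#include <string>
#include <vector>

struct pipeline {
    std::string name;
    bool needed   {}; // requested during the dryrun
    bool compiled {}; // shader module already built
};

struct device {
    bool need_compiles {};
    std::vector<std::shared_ptr<pipeline>> pipelines;
};

// mirrors ggml_pipeline_request_descriptor_sets: called per node in the dryrun
void request_pipeline(device & dev, const std::shared_ptr<pipeline> & p) {
    if (!p->compiled) {
        p->needed         = true;
        dev.need_compiles = true;
    }
}

// mirrors the need_compiles check added to ggml_backend_vk_graph_compute
void compile_if_needed(device & dev) {
    if (!dev.need_compiles) {
        return;
    }
    for (auto & p : dev.pipelines) {
        if (p->needed && !p->compiled) {
            p->compiled = true; // stands in for the real createComputePipeline call
        }
    }
    dev.need_compiles = false;
}
```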

View file

@@ -1314,10 +1314,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };
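
The new debug logs expose the layer-to-device assignment that get_layer_buft_list computes: a layer's normalized position is looked up in the cumulative split fractions with std::upper_bound, and the first device whose share exceeds it wins. A self-contained sketch with illustrative split values (the numbers are not from this diff):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // cumulative split fractions for three hypothetical GPUs (illustrative only)
    const std::vector<float> splits = {0.25f, 0.75f, 1.0f};
    const int i_gpu_start    = 0;
    const int act_gpu_layers = 32;

    for (int il : {0, 10, 24, 31}) {
        // same expression as get_layer_buft_list: first device whose cumulative
        // share exceeds the layer's normalized position
        const int layer_gpu = std::upper_bound(splits.begin(), splits.end(),
                                  float(il - i_gpu_start) / act_gpu_layers)
                              - splits.begin();
        std::printf("layer %3d -> device %d\n", il, layer_gpu);
    }
    return 0;
}
```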

View file

@@ -9446,6 +9446,7 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        std::vector<ggml_backend_dev_t> rpc_servers;
         // use all available devices
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -9456,10 +9457,19 @@ static struct llama_model * llama_model_load_from_file_impl(
                     break;
 
                 case GGML_BACKEND_DEVICE_TYPE_GPU:
-                    model->devices.push_back(dev);
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_servers.push_back(dev);
+                    } else {
+                        model->devices.push_back(dev);
+                    }
                     break;
             }
         }
+        // add RPC servers at the front of the list
+        if (!rpc_servers.empty()) {
+            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+        }
     }
 
     // if using single GPU mode, remove all except the main GPU
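
The else branch now partitions GPU-type devices by backend registry name, collecting RPC devices separately and splicing them onto the front of model->devices, which the layer-split logic consumes in order. A condensed sketch of that collection step, using the registry calls shown above (the real code also handles the CPU and ACCEL device types):

```cpp
#include <string>
#include <vector>

#include "ggml-backend.h"

// collect GPU devices, ordering RPC-backed devices first (mirrors the diff;
// CPU/ACCEL handling from the real switch statement is omitted here)
std::vector<ggml_backend_dev_t> collect_gpu_devices() {
    std::vector<ggml_backend_dev_t> devices;
    std::vector<ggml_backend_dev_t> rpc_servers;

    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
            continue;
        }
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        if (ggml_backend_reg_name(reg) == std::string("RPC")) {
            rpc_servers.push_back(dev);
        } else {
            devices.push_back(dev);
        }
    }

    // add RPC servers at the front of the list
    if (!rpc_servers.empty()) {
        devices.insert(devices.begin(), rpc_servers.begin(), rpc_servers.end());
    }
    return devices;
}
```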