mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
vulkan implementation from occam (early access, squashed)
This commit is contained in:
parent
ddc5a5033f
commit
7b3866f211
18 changed files with 69301 additions and 34 deletions
|
@ -98,6 +98,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||||
|
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
|
||||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||||
|
@ -409,6 +410,22 @@ if (LLAMA_CLBLAST)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (LLAMA_VULKAN)
|
||||||
|
find_package(Vulkan)
|
||||||
|
if (Vulkan_FOUND)
|
||||||
|
message(STATUS "Vulkan found")
|
||||||
|
|
||||||
|
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
|
||||||
|
target_link_libraries(ggml-vulkan PUBLIC Vulkan::Vulkan)
|
||||||
|
|
||||||
|
add_compile_definitions(GGML_USE_VULKAN)
|
||||||
|
|
||||||
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
|
||||||
|
else()
|
||||||
|
message(WARNING "Vulkan not found")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
if (LLAMA_HIPBLAS)
|
if (LLAMA_HIPBLAS)
|
||||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
||||||
|
|
||||||
|
|
15
Makefile
15
Makefile
|
@ -448,6 +448,21 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
endif # LLAMA_CLBLAST
|
endif # LLAMA_CLBLAST
|
||||||
|
|
||||||
|
ifdef LLAMA_VULKAN
|
||||||
|
CFLAGS += -DGGML_USE_VULKAN
|
||||||
|
CXXFLAGS += -DGGML_USE_VULKAN
|
||||||
|
LDFLAGS += -lvulkan
|
||||||
|
OBJS += ggml-vulkan.o
|
||||||
|
|
||||||
|
ifdef LLAMA_VULKAN_CHECK_RESULTS
|
||||||
|
CFLAGS += -DGGML_VULKAN_CHECK_RESULTS
|
||||||
|
CXXFLAGS += -DGGML_VULKAN_CHECK_RESULTS
|
||||||
|
endif
|
||||||
|
|
||||||
|
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
endif # LLAMA_VULKAN
|
||||||
|
|
||||||
ifdef LLAMA_HIPBLAS
|
ifdef LLAMA_HIPBLAS
|
||||||
|
|
||||||
ifeq ($(wildcard /opt/rocm),)
|
ifeq ($(wildcard /opt/rocm),)
|
||||||
|
|
|
@ -562,6 +562,7 @@ struct test {
|
||||||
static const int build_number;
|
static const int build_number;
|
||||||
static const bool cuda;
|
static const bool cuda;
|
||||||
static const bool opencl;
|
static const bool opencl;
|
||||||
|
static const bool vulkan;
|
||||||
static const bool metal;
|
static const bool metal;
|
||||||
static const bool gpu_blas;
|
static const bool gpu_blas;
|
||||||
static const bool blas;
|
static const bool blas;
|
||||||
|
@ -643,6 +644,9 @@ struct test {
|
||||||
if (opencl) {
|
if (opencl) {
|
||||||
return "OpenCL";
|
return "OpenCL";
|
||||||
}
|
}
|
||||||
|
if (vulkan) {
|
||||||
|
return "Vulkan";
|
||||||
|
}
|
||||||
if (metal) {
|
if (metal) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
}
|
}
|
||||||
|
@ -658,7 +662,7 @@ struct test {
|
||||||
static const std::vector<std::string> & get_fields() {
|
static const std::vector<std::string> & get_fields() {
|
||||||
static const std::vector<std::string> fields = {
|
static const std::vector<std::string> fields = {
|
||||||
"build_commit", "build_number",
|
"build_commit", "build_number",
|
||||||
"cuda", "opencl", "metal", "gpu_blas", "blas",
|
"cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
|
||||||
"cpu_info", "gpu_info",
|
"cpu_info", "gpu_info",
|
||||||
"model_filename", "model_type", "model_size", "model_n_params",
|
"model_filename", "model_type", "model_size", "model_n_params",
|
||||||
"n_batch", "n_threads", "type_k", "type_v",
|
"n_batch", "n_threads", "type_k", "type_v",
|
||||||
|
@ -682,7 +686,7 @@ struct test {
|
||||||
field == "avg_ns" || field == "stddev_ns") {
|
field == "avg_ns" || field == "stddev_ns") {
|
||||||
return INT;
|
return INT;
|
||||||
}
|
}
|
||||||
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
|
if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
|
||||||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
|
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
|
||||||
return BOOL;
|
return BOOL;
|
||||||
}
|
}
|
||||||
|
@ -710,7 +714,7 @@ struct test {
|
||||||
}
|
}
|
||||||
std::vector<std::string> values = {
|
std::vector<std::string> values = {
|
||||||
build_commit, std::to_string(build_number),
|
build_commit, std::to_string(build_number),
|
||||||
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||||
cpu_info, gpu_info,
|
cpu_info, gpu_info,
|
||||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||||
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||||
|
@ -738,6 +742,7 @@ const std::string test::build_commit = LLAMA_COMMIT;
|
||||||
const int test::build_number = LLAMA_BUILD_NUMBER;
|
const int test::build_number = LLAMA_BUILD_NUMBER;
|
||||||
const bool test::cuda = !!ggml_cpu_has_cublas();
|
const bool test::cuda = !!ggml_cpu_has_cublas();
|
||||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||||
|
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
||||||
const bool test::metal = !!ggml_cpu_has_metal();
|
const bool test::metal = !!ggml_cpu_has_metal();
|
||||||
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
||||||
const bool test::blas = !!ggml_cpu_has_blas();
|
const bool test::blas = !!ggml_cpu_has_blas();
|
||||||
|
|
106
ggml-alloc.c
106
ggml-alloc.c
|
@ -776,38 +776,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
|
||||||
}
|
}
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
|
||||||
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
|
|
||||||
|
|
||||||
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
static bool alloc_tensor_range(struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * first, struct ggml_tensor * last,
|
||||||
size_t nbytes = 0;
|
ggml_backend_buffer_type_t buft, size_t size,
|
||||||
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
|
||||||
if (t->data == NULL && t->view_src == NULL) {
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
||||||
nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nbytes == 0) {
|
|
||||||
// all the tensors in the context are already allocated
|
|
||||||
#ifndef NDEBUG
|
|
||||||
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
|
||||||
#endif
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
|
|
||||||
if (buffer == NULL) {
|
if (buffer == NULL) {
|
||||||
// failed to allocate buffer
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
|
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
||||||
#endif
|
#endif
|
||||||
return NULL;
|
for (size_t i = 0; i < *n_buffers; i++) {
|
||||||
|
ggml_backend_buffer_free(*buffers[i]);
|
||||||
|
}
|
||||||
|
free(buffers);
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
|
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
|
||||||
|
|
||||||
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
|
||||||
if (t->data == NULL) {
|
if (t->data == NULL) {
|
||||||
if (t->view_src == NULL) {
|
if (t->view_src == NULL) {
|
||||||
ggml_tallocr_alloc(tallocr, t);
|
ggml_tallocr_alloc(tallocr, t);
|
||||||
|
@ -824,6 +812,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
||||||
|
|
||||||
ggml_tallocr_free(tallocr);
|
ggml_tallocr_free(tallocr);
|
||||||
|
|
||||||
|
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
|
||||||
|
(*buffers)[(*n_buffers)++] = buffer;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
|
||||||
|
|
||||||
|
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
||||||
|
size_t max_size = ggml_backend_buft_get_max_size(buft);
|
||||||
|
|
||||||
|
ggml_backend_buffer_t * buffers = NULL;
|
||||||
|
size_t n_buffers = 0;
|
||||||
|
|
||||||
|
size_t cur_buf_size = 0;
|
||||||
|
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
|
||||||
|
for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||||
|
size_t this_size = 0;
|
||||||
|
if (t->data == NULL && t->view_src == NULL) {
|
||||||
|
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this_size > max_size) {
|
||||||
|
// tensor is too large to fit in a single buffer
|
||||||
|
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
|
||||||
|
__func__, t->name,
|
||||||
|
ggml_backend_buft_name(buft),
|
||||||
|
this_size, max_size);
|
||||||
|
for (size_t i = 0; i < n_buffers; i++) {
|
||||||
|
ggml_backend_buffer_free(buffers[i]);
|
||||||
|
}
|
||||||
|
free(buffers);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((cur_buf_size + this_size) > max_size) {
|
||||||
|
// allocate tensors in the current buffer
|
||||||
|
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
first = t;
|
||||||
|
cur_buf_size = this_size;
|
||||||
|
} else {
|
||||||
|
cur_buf_size += this_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate remaining tensors
|
||||||
|
if (cur_buf_size > 0) {
|
||||||
|
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_buffers == 0) {
|
||||||
|
// all the tensors in the context are already allocated
|
||||||
|
#ifndef NDEBUG
|
||||||
|
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
||||||
|
#endif
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_t buffer;
|
||||||
|
if (n_buffers == 1) {
|
||||||
|
buffer = buffers[0];
|
||||||
|
} else {
|
||||||
|
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
|
||||||
|
}
|
||||||
|
free(buffers);
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@ extern "C" {
|
||||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
||||||
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||||
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||||
|
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
|
||||||
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||||
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||||
// check if tensor data is in host memory
|
// check if tensor data is in host memory
|
||||||
|
@ -63,6 +64,11 @@ extern "C" {
|
||||||
// do not use directly, use ggml_backend_tensor_copy instead
|
// do not use directly, use ggml_backend_tensor_copy instead
|
||||||
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
|
||||||
|
// buffer that contains a collection of buffers
|
||||||
|
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
||||||
|
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
||||||
|
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend
|
// Backend
|
||||||
//
|
//
|
||||||
|
|
104
ggml-backend.c
104
ggml-backend.c
|
@ -27,6 +27,14 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
return buft->iface.get_alignment(buft);
|
return buft->iface.get_alignment(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||||
|
// get_max_size is optional, defaults to SIZE_MAX
|
||||||
|
if (buft->iface.get_max_size) {
|
||||||
|
return buft->iface.get_max_size(buft);
|
||||||
|
}
|
||||||
|
return SIZE_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||||
if (buft->iface.get_alloc_size) {
|
if (buft->iface.get_alloc_size) {
|
||||||
|
@ -55,8 +63,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
size_t size) {
|
size_t size) {
|
||||||
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
||||||
|
|
||||||
GGML_ASSERT(iface.get_base != NULL);
|
|
||||||
|
|
||||||
(*buffer) = (struct ggml_backend_buffer) {
|
(*buffer) = (struct ggml_backend_buffer) {
|
||||||
/* .interface = */ iface,
|
/* .interface = */ iface,
|
||||||
/* .buft = */ buft,
|
/* .buft = */ buft,
|
||||||
|
@ -106,6 +112,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
|
||||||
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
||||||
|
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
||||||
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||||
}
|
}
|
||||||
|
@ -120,6 +130,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||||
|
|
||||||
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||||
buffer->usage = usage;
|
buffer->usage = usage;
|
||||||
|
|
||||||
|
// FIXME: add a generic callback to the buffer interface
|
||||||
|
if (ggml_backend_buffer_is_multi_buffer(buffer)) {
|
||||||
|
ggml_backend_multi_buffer_set_usage(buffer, usage);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
||||||
|
@ -169,6 +184,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
|
||||||
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
|
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
|
||||||
|
return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||||
|
@ -342,6 +361,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
||||||
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
|
||||||
|
ggml_backend_vk_reg_devices();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
||||||
|
@ -545,6 +569,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
|
@ -600,6 +625,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
||||||
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
|
@ -756,6 +782,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
|
||||||
GGML_UNUSED(user_data);
|
GGML_UNUSED(user_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// multi-buffer buffer
|
||||||
|
|
||||||
|
struct ggml_backend_multi_buffer_context {
|
||||||
|
ggml_backend_buffer_t * buffers;
|
||||||
|
size_t n_buffers;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
|
||||||
|
|
||||||
|
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||||
|
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
||||||
|
|
||||||
|
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
|
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
||||||
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
ggml_backend_buffer_free(ctx->buffers[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(ctx->buffers);
|
||||||
|
free(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
||||||
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
ggml_backend_buffer_clear(ctx->buffers[i], value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
|
||||||
|
static struct ggml_backend_buffer_i multi_backend_buffer_i = {
|
||||||
|
/* .get_name = */ ggml_backend_multi_buffer_get_name,
|
||||||
|
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
||||||
|
/* .get_base = */ NULL,
|
||||||
|
/* .init_tensor = */ NULL,
|
||||||
|
/* .set_tensor = */ NULL,
|
||||||
|
/* .get_tensor = */ NULL,
|
||||||
|
/* .cpy_tensor = */ NULL,
|
||||||
|
/* .clear = */ ggml_backend_multi_buffer_clear,
|
||||||
|
/* .reset = */ NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
return multi_backend_buffer_i;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
|
||||||
|
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
|
||||||
|
ctx->n_buffers = n_buffers;
|
||||||
|
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
|
||||||
|
|
||||||
|
size_t total_size = 0;
|
||||||
|
for (size_t i = 0; i < n_buffers; i++) {
|
||||||
|
ctx->buffers[i] = buffers[i];
|
||||||
|
total_size += ggml_backend_buffer_get_size(buffers[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
||||||
|
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||||
|
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
|
||||||
|
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
||||||
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// scheduler
|
// scheduler
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@ extern "C" {
|
||||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||||
|
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||||
|
@ -36,6 +37,7 @@ extern "C" {
|
||||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||||
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||||
|
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||||
|
@ -54,6 +56,7 @@ extern "C" {
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
||||||
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
||||||
|
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
|
|
|
@ -10449,6 +10449,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||||
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
|
@ -10724,6 +10725,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
||||||
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
|
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
||||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||||
|
@ -10803,6 +10805,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||||
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
|
|
|
@ -2441,6 +2441,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
||||||
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
||||||
|
|
|
@ -2055,6 +2055,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
|
||||||
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
|
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // TODO: return from device info
|
||||||
/* .get_alloc_size = */ NULL,
|
/* .get_alloc_size = */ NULL,
|
||||||
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
|
||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
|
@ -2111,6 +2112,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
|
||||||
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
|
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
|
|
61447
ggml-vulkan-shaders.hpp
Normal file
61447
ggml-vulkan-shaders.hpp
Normal file
File diff suppressed because it is too large
Load diff
5141
ggml-vulkan.cpp
Normal file
5141
ggml-vulkan.cpp
Normal file
File diff suppressed because it is too large
Load diff
46
ggml-vulkan.h
Normal file
46
ggml-vulkan.h
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define GGML_VK_NAME "Vulkan"
|
||||||
|
|
||||||
|
GGML_API void ggml_vk_init(void);
|
||||||
|
|
||||||
|
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
|
||||||
|
GGML_API void ggml_vk_preallocate_buffers(void);
|
||||||
|
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
|
||||||
|
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||||
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||||
|
void ggml_vk_check_results_0(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||||
|
void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||||
|
#endif
|
||||||
|
GGML_API void ggml_vk_graph_cleanup(void);
|
||||||
|
|
||||||
|
GGML_API void * ggml_vk_host_malloc(size_t size);
|
||||||
|
GGML_API void ggml_vk_host_free(void * ptr);
|
||||||
|
|
||||||
|
GGML_API void ggml_vk_transform_tensor_temporary(const void * data, struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_vk_transform_tensor_static(const void * data, struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_vk_assign_buffer(struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_vk_prepare_tensor(struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_vk_cleanup(void);
|
||||||
|
|
||||||
|
GGML_API bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
|
||||||
|
|
||||||
|
// backend API
|
||||||
|
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void);
|
||||||
|
|
||||||
|
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
|
||||||
|
|
||||||
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void);
|
||||||
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
45
ggml.c
45
ggml.c
|
@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
#include "ggml-opencl.h"
|
#include "ggml-opencl.h"
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// floating point type used to accumulate sums
|
// floating point type used to accumulate sums
|
||||||
|
@ -2293,6 +2295,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||||
ggml_init_cublas();
|
ggml_init_cublas();
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
ggml_cl_init();
|
ggml_cl_init();
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
ggml_vk_init();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ggml_setup_op_has_task_pass();
|
ggml_setup_op_has_task_pass();
|
||||||
|
@ -7999,7 +8003,7 @@ static void ggml_compute_forward_mul_f32(
|
||||||
const int ith = params->ith;
|
const int ith = params->ith;
|
||||||
const int nth = params->nth;
|
const int nth = params->nth;
|
||||||
|
|
||||||
#ifdef GGML_USE_CLBLAST
|
#if defined(GGML_USE_CLBLAST)
|
||||||
if (src1->backend == GGML_BACKEND_GPU) {
|
if (src1->backend == GGML_BACKEND_GPU) {
|
||||||
// TODO: OpenCL kernel support full broadcast
|
// TODO: OpenCL kernel support full broadcast
|
||||||
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
||||||
|
@ -14683,6 +14687,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
}
|
}
|
||||||
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
||||||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
|
||||||
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||||
|
if (skip_cpu) {
|
||||||
|
ggml_vk_check_results_1(params, tensor);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (skip_cpu) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
||||||
|
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
switch (tensor->op) {
|
switch (tensor->op) {
|
||||||
|
@ -17079,6 +17095,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
|
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
|
||||||
|
}
|
||||||
|
ggml_vk_preallocate_buffers();
|
||||||
|
|
||||||
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
|
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
const int n_threads = cplan->n_threads;
|
const int n_threads = cplan->n_threads;
|
||||||
|
|
||||||
struct ggml_compute_state_shared state_shared = {
|
struct ggml_compute_state_shared state_shared = {
|
||||||
|
@ -17130,6 +17157,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
ggml_vk_graph_cleanup();
|
||||||
|
#endif
|
||||||
|
|
||||||
// performance stats (graph)
|
// performance stats (graph)
|
||||||
{
|
{
|
||||||
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
||||||
|
@ -20264,7 +20295,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_blas(void) {
|
int ggml_cpu_has_blas(void) {
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST)
|
||||||
return 1;
|
return 1;
|
||||||
#else
|
#else
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -20287,8 +20318,16 @@ int ggml_cpu_has_clblast(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_vulkan(void) {
|
||||||
|
#if defined(GGML_USE_VULKAN)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_gpublas(void) {
|
int ggml_cpu_has_gpublas(void) {
|
||||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
|
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan();
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_sse3(void) {
|
int ggml_cpu_has_sse3(void) {
|
||||||
|
|
1
ggml.h
1
ggml.h
|
@ -2263,6 +2263,7 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_blas (void);
|
GGML_API int ggml_cpu_has_blas (void);
|
||||||
GGML_API int ggml_cpu_has_cublas (void);
|
GGML_API int ggml_cpu_has_cublas (void);
|
||||||
GGML_API int ggml_cpu_has_clblast (void);
|
GGML_API int ggml_cpu_has_clblast (void);
|
||||||
|
GGML_API int ggml_cpu_has_vulkan (void);
|
||||||
GGML_API int ggml_cpu_has_gpublas (void);
|
GGML_API int ggml_cpu_has_gpublas (void);
|
||||||
GGML_API int ggml_cpu_has_sse3 (void);
|
GGML_API int ggml_cpu_has_sse3 (void);
|
||||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||||
|
|
2363
ggml_vk_generate_shaders.py
Normal file
2363
ggml_vk_generate_shaders.py
Normal file
File diff suppressed because it is too large
Load diff
22
llama.cpp
22
llama.cpp
|
@ -1,6 +1,8 @@
|
||||||
#define LLAMA_API_INTERNAL
|
#define LLAMA_API_INTERNAL
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
@ -11,6 +13,8 @@
|
||||||
# include "ggml-cuda.h"
|
# include "ggml-cuda.h"
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
# include "ggml-opencl.h"
|
# include "ggml-opencl.h"
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
# include "ggml-vulkan.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
|
@ -1258,6 +1262,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CPU_HBM)
|
#elif defined(GGML_USE_CPU_HBM)
|
||||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
if (host_buffer) {
|
||||||
|
buft = ggml_backend_vk_host_buffer_type();
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (buft == nullptr) {
|
if (buft == nullptr) {
|
||||||
|
@ -1275,6 +1283,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
||||||
buft = ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
#elif defined(GGML_USE_CUBLAS)
|
#elif defined(GGML_USE_CUBLAS)
|
||||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
buft = ggml_backend_vk_buffer_type();
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
buft = ggml_backend_opencl_buffer_type();
|
buft = ggml_backend_opencl_buffer_type();
|
||||||
#endif
|
#endif
|
||||||
|
@ -6652,7 +6662,7 @@ static int llama_decode_internal(
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
|
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
|
||||||
if (ggml_cpu_has_cublas() && fully_offloaded) {
|
if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
|
||||||
n_threads = 1;
|
n_threads = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9878,6 +9888,16 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
if (model->n_gpu_layers > 0) {
|
||||||
|
ggml_backend_t backend = ggml_backend_vk_init();
|
||||||
|
if (backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
ctx->backends.push_back(backend);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -46,7 +46,7 @@
|
||||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||||
#define LLAMA_SESSION_VERSION 4
|
#define LLAMA_SESSION_VERSION 4
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN)
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue