Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	tests/test-backend-ops.cpp
This commit is contained in:
Concedo 2025-06-29 15:10:26 +08:00
commit d383c03554
4 changed files with 182 additions and 88 deletions

View file

@ -321,7 +321,7 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
}
struct vk_device_struct {
std::mutex mutex;
std::recursive_mutex mutex;
vk::PhysicalDevice physical_device;
vk::PhysicalDeviceProperties properties;
@ -1213,7 +1213,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
}
{
std::lock_guard<std::mutex> guard(device->mutex);
std::lock_guard<std::recursive_mutex> guard(device->mutex);
device->pipelines.insert({ pipeline->name, pipeline });
}
@ -1427,7 +1427,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) {
VK_LOG_DEBUG("ggml_vk_create_queue()");
std::lock_guard<std::mutex> guard(device->mutex);
std::lock_guard<std::recursive_mutex> guard(device->mutex);
q.queue_family_index = queue_family_index;
q.transfer_only = transfer_only;
@ -4148,6 +4148,7 @@ static void * ggml_vk_host_malloc(vk_device& device, size_t size) {
return nullptr;
}
std::lock_guard<std::recursive_mutex> guard(device->mutex);
device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
return buf->ptr;
@ -4158,6 +4159,8 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) {
return;
}
VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
std::lock_guard<std::recursive_mutex> guard(device->mutex);
vk_buffer buf;
size_t index;
for (size_t i = 0; i < device->pinned_memory.size(); i++) {
@ -4180,6 +4183,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) {
}
static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
std::lock_guard<std::recursive_mutex> guard(device->mutex);
buf = nullptr;
buf_offset = 0;
for (size_t i = 0; i < device->pinned_memory.size(); i++) {
@ -4481,7 +4485,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
}
} else {
std::lock_guard<std::mutex> guard(dst->device->mutex);
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
@ -4572,7 +4576,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
memcpy(dst, (uint8_t *) src->ptr + offset, size);
} else {
std::lock_guard<std::mutex> guard(src->device->mutex);
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
@ -4602,7 +4606,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
if (src->device == dst->device) {
std::lock_guard<std::mutex> guard(src->device->mutex);
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
// Copy within the device
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
@ -4637,7 +4641,7 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
std::lock_guard<std::mutex> guard(dst->device->mutex);
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
@ -4864,9 +4868,17 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
// type size must be exactly 2 or 4.
GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4);
if ((ggml_type_size(src->type) % 4) == 0) {
return ctx->device->pipeline_contig_cpy_f32_f32;
if (contig) {
return ctx->device->pipeline_contig_cpy_f32_f32;
} else {
return ctx->device->pipeline_cpy_f32_f32;
}
} else {
return ctx->device->pipeline_contig_cpy_f16_f16;
if (contig) {
return ctx->device->pipeline_contig_cpy_f16_f16;
} else {
return ctx->device->pipeline_cpy_f16_f16;
}
}
}
@ -4927,7 +4939,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
const uint64_t ne00 = src0->ne[0];
@ -5155,7 +5167,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
std::cerr << "), " << (dryrun ? "dryrun" : "") << "),)");
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
const uint64_t ne00 = src0->ne[0];
@ -5756,7 +5768,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
GGML_ASSERT(ids->type == GGML_TYPE_I32);