mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/tools.sh # build-xcframework.sh # ci/run.sh # examples/Miku.sh # examples/chat-13B.sh # examples/chat-persistent.sh # examples/chat-vicuna.sh # examples/chat.sh # examples/jeopardy/jeopardy.sh # examples/reason-act.sh # examples/server-llama2-13B.sh # examples/sycl/build.sh # examples/sycl/run-llama2.sh # examples/sycl/run-llama3.sh # examples/ts-type-to-grammar.sh # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-sycl/element_wise.cpp # ggml/src/ggml-sycl/element_wise.hpp # ggml/src/ggml-sycl/ggml-sycl.cpp # scripts/apple/validate-apps.sh # scripts/apple/validate-ios.sh # scripts/apple/validate-macos.sh # scripts/apple/validate-tvos.sh # scripts/apple/validate-visionos.sh # scripts/check-requirements.sh # scripts/ci-run.sh # scripts/compare-commits.sh # scripts/debug-test.sh # scripts/gen-authors.sh # scripts/get-hellaswag.sh # scripts/get-pg.sh # scripts/get-wikitext-103.sh # scripts/get-wikitext-2.sh # scripts/get-winogrande.sh # scripts/hf.sh # scripts/qnt-all.sh # scripts/run-all-perf.sh # scripts/run-all-ppl.sh # scripts/sync-ggml-am.sh # scripts/sync-ggml.sh # scripts/tool_bench.sh # tests/test-backend-ops.cpp # tests/test-lora-conversion-inference.sh # tests/test-tokenizer-0.sh # tools/server/README.md
This commit is contained in:
commit
cdda9d16e0
42 changed files with 1519 additions and 118 deletions
|
@ -441,6 +441,7 @@ struct vk_device_struct {
|
|||
vk_pipeline pipeline_norm_f32;
|
||||
vk_pipeline pipeline_group_norm_f32;
|
||||
vk_pipeline pipeline_rms_norm_f32;
|
||||
vk_pipeline pipeline_rms_norm_mul_f32;
|
||||
vk_pipeline pipeline_rms_norm_back_f32;
|
||||
vk_pipeline pipeline_l2_norm_f32;
|
||||
|
||||
|
@ -452,6 +453,10 @@ struct vk_device_struct {
|
|||
vk_pipeline pipeline_tanh[2];
|
||||
vk_pipeline pipeline_sigmoid[2];
|
||||
|
||||
vk_pipeline pipeline_geglu[2];
|
||||
vk_pipeline pipeline_reglu[2];
|
||||
vk_pipeline pipeline_swiglu[2];
|
||||
|
||||
vk_pipeline pipeline_leaky_relu_f32;
|
||||
vk_pipeline pipeline_silu_back_f32;
|
||||
vk_pipeline pipeline_diag_mask_inf_f32;
|
||||
|
@ -676,6 +681,13 @@ struct vk_op_push_constants {
|
|||
float param2;
|
||||
};
|
||||
|
||||
// Push constants handed to the GLU pipelines (geglu / reglu / swiglu).
// ggml_vk_glu fills the members in declaration order: N, ne00, ne20, mode.
struct vk_op_glu_push_constants {
|
||||
uint32_t N; // ggml_nelements(dst): total number of output elements
|
||||
uint32_t ne00; // src0->ne[0]
|
||||
uint32_t ne20; // dst->ne[0]
|
||||
uint32_t mode; // 0: default, 1: swapped, 2: split
|
||||
};
|
||||
|
||||
struct vk_op_unary_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
|
@ -994,6 +1006,10 @@ struct ggml_backend_vk_context {
|
|||
|
||||
vk_command_pool compute_cmd_pool;
|
||||
vk_command_pool transfer_cmd_pool;
|
||||
|
||||
// number of additional consecutive nodes that are being fused with the
|
||||
// node currently being processed
|
||||
uint32_t num_additional_fused_ops {};
|
||||
};
|
||||
|
||||
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
|
||||
|
@ -2671,7 +2687,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_f32, "rms_norm_mul_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
|
@ -2767,6 +2784,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
CREATE_UNARY(sigmoid)
|
||||
#undef CREATE_UNARY
|
||||
|
||||
// Creates the two pipelines of one GLU variant: slot [0] is built from the
// <name>_f32 shader data and slot [1] from the <name>_f16 shader data. Both use
// vk_op_glu_push_constants as their push-constant block.
#define CREATE_GLU(name) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true);
|
||||
|
||||
CREATE_GLU(geglu)
|
||||
CREATE_GLU(reglu)
|
||||
CREATE_GLU(swiglu)
|
||||
#undef CREATE_GLU
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
|
@ -6454,7 +6480,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
return nullptr;
|
||||
case GGML_OP_RMS_NORM:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_rms_norm_f32;
|
||||
return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
|
@ -6491,6 +6517,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
break;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_GLU:
|
||||
if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
|
||||
(dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
|
||||
(src0->type != dst->type)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
switch (ggml_get_glu_op(dst)) {
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
return ctx->device->pipeline_geglu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_GLU_OP_REGLU:
|
||||
return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16];
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16];
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_diag_mask_inf_f32;
|
||||
|
@ -6951,6 +6995,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_GLU:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
{
|
||||
uint32_t ne = ggml_nelements(dst);
|
||||
|
@ -6991,7 +7036,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
}
|
||||
}
|
||||
|
||||
if (op == GGML_OP_SOFT_MAX) {
|
||||
if (op == GGML_OP_SOFT_MAX || op == GGML_OP_GLU) {
|
||||
// Empty src1 is possible in soft_max and in non-split glu, but the shader needs a buffer
|
||||
vk_subbuffer subbuf_y;
|
||||
if (use_src1) {
|
||||
|
@ -7554,18 +7599,19 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun);
|
||||
}
|
||||
|
||||
// Records an RMS_NORM dispatch, optionally fused with the MUL that consumes it.
//
//   src0   - input tensor of the RMS_NORM node
//   src1   - second operand of the fused MUL; the unfused caller passes src0 here
//   dst    - tensor that receives the result: the RMS_NORM node itself, or the
//            MUL node when the two ops are fused
//   dryrun - forwarded unchanged to ggml_vk_op_f32
//
// Epsilon is stored in the op_params of the RMS_NORM node. In the fused case dst
// is the MUL node, so reading dst->op_params would pick up the MUL node's
// parameters instead of epsilon. The RMS_NORM node is therefore looked up among
// the operands of dst before epsilon is read.
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
    // Locate the node that actually carries the RMS_NORM parameters.
    const ggml_tensor * norm_node = dst;
    if (dst->op == GGML_OP_MUL) {
        for (int i = 0; i < 2; ++i) {
            const ggml_tensor * candidate = dst->src[i];
            if (candidate != nullptr && candidate->op == GGML_OP_RMS_NORM && candidate->src[0] == src0) {
                norm_node = candidate;
                break;
            }
        }
    }
    const float eps = ((const float *)norm_node->op_params)[0];

    const uint32_t src0_type_size = ggml_type_size(src0->type);
    const uint32_t src1_type_size = ggml_type_size(src1->type);
    const uint32_t dst_type_size = ggml_type_size(dst->type);

    // Strides are passed in elements, hence the division by the type size.
    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_RMS_NORM, {
        (uint32_t)ggml_nelements(src0),
        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
        0,
        eps, 0.0f, 0,
    }, dryrun);
}
|
||||
|
||||
|
@ -7583,6 +7629,25 @@ static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, con
|
|||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
|
||||
}
|
||||
|
||||
// Records a GLU (geglu / reglu / swiglu) dispatch.
//
// Two input layouts are accepted:
//   - single input (src1 == nullptr): dst->ne[0] must be half of src0->ne[0];
//     dst->op_params[1] selects the "swapped" variant.
//   - split input (src1 != nullptr): src0, src1 and dst share the same ne[0],
//     and src0 and src1 share the same type.
// The layout is passed to the shader as mode 0 (default), 1 (swapped) or 2 (split).
static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
    GGML_ASSERT(ggml_is_contiguous(src0));

    uint32_t mode;
    if (src1 != nullptr) {
        // split: the two halves arrive as separate tensors
        GGML_ASSERT(src0->ne[0] == src1->ne[0]);
        GGML_ASSERT(src0->ne[0] == dst->ne[0]);
        GGML_ASSERT(src0->type == src1->type);
        mode = 2;
    } else {
        // single tensor holding both halves along dimension 0
        GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]);
        mode = (dst->op_params[1] != 0) ? 1 : 0;
    }

    const uint32_t n_out    = (uint32_t)ggml_nelements(dst);
    const uint32_t src_row  = (uint32_t)src0->ne[0];
    const uint32_t dst_row  = (uint32_t)dst->ne[0];

    ggml_vk_op_f32<vk_op_glu_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GLU, { n_out, src_row, dst_row, mode }, dryrun);
}
|
||||
|
||||
static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
int32_t * op_params = (int32_t *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
|
||||
|
@ -8760,7 +8825,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* t
|
|||
|
||||
// Returns true if node has enqueued work into the queue, false otherwise
|
||||
// If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
|
||||
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
|
||||
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
|
||||
ggml_tensor * node = cgraph->nodes[node_idx];
|
||||
if (ggml_is_empty(node) || !node->buffer) {
|
||||
return false;
|
||||
}
|
||||
|
@ -8794,6 +8860,16 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(node)) {
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_GET_ROWS:
|
||||
|
@ -8886,6 +8962,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||
case GGML_OP_RMS_NORM_BACK:
|
||||
case GGML_OP_L2_NORM:
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_GLU:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
|
@ -8998,8 +9075,14 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||
|
||||
break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
ggml_vk_rms_norm(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
if (ctx->num_additional_fused_ops > 0) {
|
||||
// fused rms_norm + mul
|
||||
ggml_tensor *mul = cgraph->nodes[node_idx + 1];
|
||||
ggml_tensor *other_src = mul->src[0] == node ? mul->src[1] : mul->src[0];
|
||||
ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, dryrun);
|
||||
} else {
|
||||
ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, dryrun);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
|
@ -9023,6 +9106,17 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(node)) {
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
|
@ -9148,8 +9242,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||
if (!ok) {
|
||||
if (node->op == GGML_OP_UNARY) {
|
||||
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
|
||||
}
|
||||
else {
|
||||
} else if (node->op == GGML_OP_GLU) {
|
||||
std::cerr << __func__ << ": error: op not supported GLU " << node->name << " (" << ggml_glu_op_name(static_cast<ggml_glu_op>(node->op_params[0])) << ")" << std::endl;
|
||||
} else {
|
||||
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
@ -9228,6 +9323,17 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
|||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(tensor)) {
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
buf = tensor->buffer;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
|
@ -9734,10 +9840,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
|
||||
uint64_t total_mat_mul_bytes = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
|
||||
if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
}
|
||||
ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
|
||||
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
|
||||
total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
|
||||
}
|
||||
i += ctx->num_additional_fused_ops;
|
||||
ctx->num_additional_fused_ops = 0;
|
||||
}
|
||||
if (ctx->device->need_compiles) {
|
||||
ggml_vk_load_shaders(ctx->device);
|
||||
|
@ -9799,14 +9910,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
|
||||
}
|
||||
|
||||
if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
}
|
||||
|
||||
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
|
||||
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
|
||||
bool submit = (submitted_nodes >= nodes_per_submit) ||
|
||||
(mul_mat_bytes >= mul_mat_bytes_per_submit) ||
|
||||
(i == last_node) ||
|
||||
(i + ctx->num_additional_fused_ops == last_node) ||
|
||||
(almost_ready && !ctx->almost_ready_fence_pending);
|
||||
|
||||
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
|
||||
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i + ctx->num_additional_fused_ops == last_node, almost_ready, submit);
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
|
@ -9816,7 +9931,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
|
||||
// If there are fused ops, just write out timestamps for all nodes to keep the accounting simple
|
||||
for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) {
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1);
|
||||
}
|
||||
}
|
||||
|
||||
if (enqueued) {
|
||||
|
@ -9838,6 +9956,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
}
|
||||
submit_count++;
|
||||
}
|
||||
i += ctx->num_additional_fused_ops;
|
||||
ctx->num_additional_fused_ops = 0;
|
||||
}
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
|
@ -10012,6 +10132,19 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(op)) {
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
(op->src[0]->type == op->type);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
|
@ -10742,6 +10875,12 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
|||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_GLU) {
|
||||
if (src_clone[1] == nullptr) {
|
||||
tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
|
||||
} else {
|
||||
tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
|
||||
if (src1 == nullptr) {
|
||||
tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue