Mirror of https://github.com/LostRuins/koboldcpp.git
Synced 2025-09-11 09:34:37 +00:00

Commit b63158005f: All samplers moved to kcpp side
54 changed files with 3765 additions and 2577 deletions; the hunks shown below are from the Vulkan backend, ggml-vulkan.cpp.
@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
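With batched submission a caller may now wait on the fence even when no sequences were recorded, which is why the empty early-out signals the fence through an empty queue submit. A minimal standalone sketch of that idiom (helper name is hypothetical; vulkan.hpp assumed):

#include <vulkan/vulkan.hpp>

// Submitting zero command buffers with a fence is valid Vulkan: the fence
// is signaled once the queue reaches this point, so a pending
// waitForFences() returns even though no work was enqueued.
void signal_fence_only(vk::Queue queue, vk::Fence fence) {
    queue.submit({}, fence);
}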
@@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
     }, dryrun);
 }
 
-static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
 
@@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }
 
-static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
 
@@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }
 
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
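The dryrun parameter threaded through ggml_vk_sin and ggml_vk_cos here matches the pattern the other element-wise ops already follow: ggml_backend_vk_graph_compute (last hunks below) first walks the graph with dryrun=true so each op only registers the pipeline it will need, then preallocates buffers and descriptor sets before the real recording pass. A toy sketch of the two-pass idiom, with hypothetical names:

#include <cstdio>

// Hypothetical stand-in for a pipeline record; 'needed' is bumped during
// the dry run so descriptor sets can be allocated before recording.
struct PipelineInfo { int needed = 0; };

static void op_sin(PipelineInfo & pipeline, bool dryrun = false) {
    if (dryrun) {
        pipeline.needed++;   // count the request, record nothing
        return;
    }
    std::printf("recording sin dispatch\n");
}

int main() {
    PipelineInfo p;
    op_sin(p, true);   // pass 1: dry run, tally pipeline usage
    op_sin(p);         // pass 2: actually record the work
}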
@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:
@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }
 
     vk_context compute_ctx;
@@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     case GGML_OP_SIN:
-        ggml_vk_sin(ctx, compute_ctx, src0, node);
+        ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
 
         break;
     case GGML_OP_COS:
-        ggml_vk_cos(ctx, compute_ctx, src0, node);
+        ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
 
         break;
     case GGML_OP_CLAMP:
@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_DIAG_MASK_INF:
@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     default:
-        return;
+        return false;
     }
 
     if (dryrun) {
-        return;
+        return false;
     }
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass an exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
     }
+    return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
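This rewritten tail of ggml_vk_build_graph is the heart of the change: a command context is now closed either because the caller requested an early submit or because the graph ended, and only the final batch is tagged with a real exit_tensor_idx; intermediate batches are flushed through ggml_vk_compute_forward with use_fence=false so the CPU keeps recording while the GPU runs. A runnable miniature of the tagging logic (types simplified, not the backend's real data structures):

#include <cstdio>

// An intermediate batch carries exit_tensor_idx == -1, so the forward pass
// only forces a fenced wait on the batch that finishes the graph.
struct BatchCtx { int exit_tensor_idx = -1; };

static void compute_forward(const BatchCtx & b, int tensor_idx, bool use_fence) {
    if (tensor_idx == b.exit_tensor_idx) {
        use_fence = true;  // always wait for the GPU on the last submit
    }
    std::printf("submit batch at node %d (%s)\n", tensor_idx,
                use_fence ? "fenced" : "unfenced");
}

int main() {
    BatchCtx mid;                    // flushed early, never waited on
    compute_forward(mid, 0, false);  // -> unfenced

    BatchCtx last{ /*exit_tensor_idx=*/5 };
    compute_forward(last, 5, false); // -> fenced, despite use_fence=false
}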
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
         // Do staging buffer copies
         for (auto& cpy : subctx->in_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
 
-        ggml_vk_submit(subctx, ctx->fence);
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
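ggml_vk_compute_forward now takes use_fence: the fence is forced on for the exit batch (and whenever result checking is compiled in), while intermediate batches pass a null fence and return without blocking. The conditional-wait pattern in isolation, as a hedged sketch (helper name is hypothetical; vulkan.hpp assumed):

#include <cstdint>
#include <vulkan/vulkan.hpp>

// A default-constructed vk::Fence is a null handle, and submitting with a
// null fence is valid Vulkan for "signal nothing".
void submit_maybe_wait(vk::Device device, vk::Queue queue,
                       const vk::SubmitInfo & info, vk::Fence fence,
                       bool use_fence) {
    queue.submit(info, use_fence ? fence : vk::Fence{});
    if (use_fence) {
        // Block until the GPU reaches the fence, then rearm it for reuse.
        (void)device.waitForFences({ fence }, VK_TRUE, UINT64_MAX);
        device.resetFences({ fence });
    }
}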
@@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
 
+    // submit work every submit_count nodes to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
+        if (first_node_in_batch) {
+            submit_node_idx = i;
         }
 
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
             }
+#endif
         }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
-        }
-#endif
-        GGML_ASSERT(ok);
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
+        }
     }
 
 #ifdef GGML_VULKAN_PERF
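The compute loop above no longer records the whole graph and then executes node by node; it records into the current batch and asks ggml_vk_build_graph to flush every submit_count (100) enqueued nodes, or at the last node. A runnable miniature of the batching loop (counters and names echo the diff; the actual work is stubbed out):

#include <cstdio>

constexpr int submit_count = 100;  // same threshold as in the diff

int main() {
    const int n_nodes = 250;
    const int last_node = n_nodes - 1;

    bool first_node_in_batch = true;
    int submit_node_idx = 0;
    int submitted_nodes = 0;

    for (int i = 0; i < n_nodes; i++) {
        if (first_node_in_batch) {
            submit_node_idx = i;  // remember where this batch starts
        }

        bool submit = (submitted_nodes >= submit_count) || (i == last_node);

        // Stand-in for ggml_vk_build_graph(): pretend every node enqueues work.
        bool enqueued = true;
        if (enqueued) {
            ++submitted_nodes;
            first_node_in_batch = false;
        }

        if (submit) {
            std::printf("flush nodes [%d..%d]%s\n", submit_node_idx, i,
                        i == last_node ? " (wait on fence)" : "");
            first_node_in_batch = true;
            submitted_nodes = 0;
        }
    }
    return 0;
}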
@@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 return false;
             }
         } break;
+        case GGML_OP_CONT:
         case GGML_OP_CPY:
         case GGML_OP_DUP:
             {
@@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_PAD:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ARGSORT: