mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-04-26 10:41:25 +00:00
Hack to allow Kokoro to remain functional even with a much higher GGML_SCHED_MAX_SPLIT_INPUTS
This commit is contained in:
parent
707bb67b30
commit
271c4c332c
3 changed files with 18 additions and 8 deletions
|
|
@ -752,9 +752,9 @@ static bool ggml_is_view_op(enum ggml_op op) {
|
|||
#ifndef GGML_SCHED_MAX_BACKENDS
|
||||
#define GGML_SCHED_MAX_BACKENDS 16
|
||||
#endif
|
||||
//kcpp yolo fix: decreased from 30 to 16 in order to try resolve tts oom issues.
|
||||
//kcpp yolo fix: decreased from 30 to 16 in order to try to resolve tts oom issues. edit: reverted, new hack to solve kokoro added as kcpp_kokoro_alloc_hack
|
||||
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
||||
#define GGML_SCHED_MAX_SPLIT_INPUTS 16
|
||||
#define GGML_SCHED_MAX_SPLIT_INPUTS 30
|
||||
#endif
|
||||
|
||||
#ifndef GGML_SCHED_MAX_COPIES
|
||||
|
|
@ -1731,6 +1731,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
bool kcpp_kokoro_alloc_hack = false; //the kokoro allocation is too big due to the massive graph but there is nowhere else we can patch this
|
||||
//it doesn't need such a big alloc as there are not many graph splits. So, just adjust the allocation if triggered
|
||||
|
||||
ggml_backend_sched_t ggml_backend_sched_new(
|
||||
ggml_backend_t * backends,
|
||||
ggml_backend_buffer_type_t * bufts,
|
||||
|
|
@ -1764,7 +1767,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||
sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
||||
|
||||
const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
|
||||
const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||
const size_t alloc_mul_adjust = (kcpp_kokoro_alloc_hack?4:1); //kcpp: kokoro needs this as the graph size is too big
|
||||
const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2/alloc_mul_adjust;
|
||||
sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
||||
sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
||||
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
||||
|
|
@ -1773,7 +1777,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||
sched->debug_graph_size = 0;
|
||||
sched->debug_prev_graph_size = 0;
|
||||
|
||||
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
|
||||
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor)/alloc_mul_adjust + ggml_graph_overhead_custom(graph_size, false);
|
||||
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
|
||||
|
||||
const int initial_splits_capacity = 16;
|
||||
|
|
|
|||
|
|
@ -1489,7 +1489,7 @@ struct kokoro_duration_context * build_new_duration_kokoro_context(struct kokoro
|
|||
kctx->backend_cpu = ggml_backend_cpu_init();
|
||||
kctx->set_threads();
|
||||
kctx->build_schedule();
|
||||
kctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_duration_nodes()*2 + ggml_graph_overhead_custom(model->max_duration_nodes()*2, false));
|
||||
kctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_duration_nodes()*5 + ggml_graph_overhead_custom(model->max_duration_nodes()*5, false));
|
||||
return kctx;
|
||||
}
|
||||
|
||||
|
|
@ -1499,6 +1499,6 @@ struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, in
|
|||
kctx->backend_cpu = ggml_backend_cpu_init();
|
||||
kctx->set_threads();
|
||||
kctx->build_schedule();
|
||||
kctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_gen_nodes()*20 + ggml_graph_overhead_custom(model->max_gen_nodes()*20, false));
|
||||
kctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_gen_nodes()*30 + ggml_graph_overhead_custom(model->max_gen_nodes()*30, false));
|
||||
return kctx;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -312,6 +312,8 @@ struct kokoro_ubatch {
|
|||
struct kokoro_duration_response * resp = nullptr;
|
||||
};
|
||||
|
||||
extern bool kcpp_kokoro_alloc_hack;
|
||||
|
||||
struct kokoro_duration_context : runner_context {
|
||||
kokoro_duration_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
|
||||
~kokoro_duration_context() {
|
||||
|
|
@ -332,7 +334,9 @@ struct kokoro_duration_context : runner_context {
|
|||
struct ggml_tensor * token_types = nullptr;
|
||||
|
||||
void build_schedule() {
|
||||
runner_context::build_schedule(model->max_duration_nodes()*2);
|
||||
kcpp_kokoro_alloc_hack = true;
|
||||
runner_context::build_schedule(model->max_duration_nodes()*5);
|
||||
kcpp_kokoro_alloc_hack = false;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -410,7 +414,9 @@ struct kokoro_context : runner_context {
|
|||
struct ggml_tensor * uv_noise_data;
|
||||
|
||||
void build_schedule() {
|
||||
runner_context::build_schedule(model->max_gen_nodes()*20);
|
||||
kcpp_kokoro_alloc_hack = true;
|
||||
runner_context::build_schedule(model->max_gen_nodes()*30);
|
||||
kcpp_kokoro_alloc_hack = false;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue