mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .github/workflows/bench.yml
#   .github/workflows/server.yml
#   CMakeLists.txt
#   Makefile
#   README-sycl.md
#   flake.lock
This commit is contained in:
commit
65bf69d104
6 changed files with 421 additions and 572 deletions
16
ggml-alloc.c
16
ggml-alloc.c
|
@ -371,16 +371,16 @@ struct ggml_gallocr {
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
||||||
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
|
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
|
||||||
GGML_ASSERT(galloc != NULL);
|
GGML_ASSERT(galloc != NULL);
|
||||||
|
|
||||||
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
|
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
||||||
GGML_ASSERT(galloc->bufts != NULL);
|
GGML_ASSERT(galloc->bufts != NULL);
|
||||||
|
|
||||||
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
|
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
|
||||||
GGML_ASSERT(galloc->buffers != NULL);
|
GGML_ASSERT(galloc->buffers != NULL);
|
||||||
|
|
||||||
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
|
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||||
|
|
||||||
for (int i = 0; i < n_bufs; i++) {
|
for (int i = 0; i < n_bufs; i++) {
|
||||||
|
@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||||
free(galloc->hash_set.keys);
|
free(galloc->hash_set.keys);
|
||||||
free(galloc->hash_values);
|
free(galloc->hash_values);
|
||||||
galloc->hash_set.size = hash_size;
|
galloc->hash_set.size = hash_size;
|
||||||
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
|
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
|
||||||
galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
|
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
|
||||||
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
||||||
GGML_ASSERT(galloc->hash_values != NULL);
|
GGML_ASSERT(galloc->hash_values != NULL);
|
||||||
} else {
|
} else {
|
||||||
|
@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||||
// set the node_allocs from the hash table
|
// set the node_allocs from the hash table
|
||||||
if (galloc->n_nodes < graph->n_nodes) {
|
if (galloc->n_nodes < graph->n_nodes) {
|
||||||
free(galloc->node_allocs);
|
free(galloc->node_allocs);
|
||||||
galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
|
galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
|
||||||
GGML_ASSERT(galloc->node_allocs != NULL);
|
GGML_ASSERT(galloc->node_allocs != NULL);
|
||||||
}
|
}
|
||||||
galloc->n_nodes = graph->n_nodes;
|
galloc->n_nodes = graph->n_nodes;
|
||||||
|
@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||||
}
|
}
|
||||||
if (galloc->n_leafs < graph->n_leafs) {
|
if (galloc->n_leafs < graph->n_leafs) {
|
||||||
free(galloc->leaf_allocs);
|
free(galloc->leaf_allocs);
|
||||||
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
|
galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
|
||||||
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
||||||
}
|
}
|
||||||
galloc->n_leafs = graph->n_leafs;
|
galloc->n_leafs = graph->n_leafs;
|
||||||
|
|
|
@ -1725,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||||
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
||||||
|
|
||||||
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
||||||
|
|
||||||
// initialize hash table
|
// initialize hash table
|
||||||
sched->hash_set = ggml_hash_set_new(graph_size);
|
sched->hash_set = ggml_hash_set_new(graph_size);
|
||||||
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
||||||
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
|
||||||
|
|
||||||
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||||
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
|
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
||||||
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
|
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
||||||
|
|
||||||
sched->n_backends = n_backends;
|
sched->n_backends = n_backends;
|
||||||
|
|
||||||
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
||||||
|
|
||||||
const int initial_splits_capacity = 16;
|
const int initial_splits_capacity = 16;
|
||||||
sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
|
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
||||||
sched->splits_capacity = initial_splits_capacity;
|
sched->splits_capacity = initial_splits_capacity;
|
||||||
|
|
||||||
for (int b = 0; b < n_backends; b++) {
|
for (int b = 0; b < n_backends; b++) {
|
||||||
|
@ -1972,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
||||||
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
||||||
struct ggml_hash_set hash_set = {
|
struct ggml_hash_set hash_set = {
|
||||||
/* .size = */ graph->visited_hash_table.size,
|
/* .size = */ graph->visited_hash_table.size,
|
||||||
/* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
|
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
|
||||||
};
|
};
|
||||||
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
|
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
||||||
bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
|
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
|
||||||
|
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
||||||
|
|
8
ggml.c
8
ggml.c
|
@ -10826,7 +10826,7 @@ static void ggml_compute_forward_mul_mat(
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if GGML_USE_LLAMAFILE
|
#if GGML_USE_LLAMAFILE
|
||||||
if (nb10 == ggml_type_size(src1->type)) {
|
if (src1_cont) {
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||||
|
@ -10879,15 +10879,13 @@ UseGgmlGemm1:;
|
||||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||||
|
|
||||||
#if GGML_USE_LLAMAFILE
|
#if GGML_USE_LLAMAFILE
|
||||||
if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
|
if (src1->type != vec_dot_type) {
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||||
nb01/ggml_type_size(src0->type),
|
nb01/ggml_type_size(src0->type),
|
||||||
(const char *)wdata + ggml_row_size(vec_dot_type,
|
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
|
||||||
nb12/ggml_type_size(src1->type)*i12 +
|
|
||||||
nb13/ggml_type_size(src1->type)*i13),
|
|
||||||
row_size/ggml_type_size(vec_dot_type),
|
row_size/ggml_type_size(vec_dot_type),
|
||||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||||
nb1/ggml_type_size(dst->type),
|
nb1/ggml_type_size(dst->type),
|
||||||
|
|
|
@ -633,7 +633,7 @@ maxhordelen = 256
|
||||||
modelbusy = threading.Lock()
|
modelbusy = threading.Lock()
|
||||||
requestsinqueue = 0
|
requestsinqueue = 0
|
||||||
defaultport = 5001
|
defaultport = 5001
|
||||||
KcppVersion = "1.63"
|
KcppVersion = "1.64"
|
||||||
showdebug = True
|
showdebug = True
|
||||||
showsamplerwarning = True
|
showsamplerwarning = True
|
||||||
showmaxctxwarning = True
|
showmaxctxwarning = True
|
||||||
|
|
|
@ -4403,7 +4403,7 @@ static void llm_load_vocab(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// find EOT token: "<|eot_id|>", "<|im_emd|>", "<end_of_turn>", etc.
|
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
|
||||||
//
|
//
|
||||||
// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
|
// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
|
||||||
// for now, we apply this workaround to find the EOT token based on its text
|
// for now, we apply this workaround to find the EOT token based on its text
|
||||||
|
@ -4414,7 +4414,7 @@ static void llm_load_vocab(
|
||||||
// need to fix convert script
|
// need to fix convert script
|
||||||
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|
||||||
(t.first == "<|eot_id|>" ||
|
(t.first == "<|eot_id|>" ||
|
||||||
t.first == "<|im_emd|>" ||
|
t.first == "<|im_end|>" ||
|
||||||
t.first == "<end_of_turn>"
|
t.first == "<end_of_turn>"
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue