Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/bench.yml
#	.github/workflows/server.yml
#	CMakeLists.txt
#	Makefile
#	README-sycl.md
#	flake.lock
This commit is contained in:
Concedo 2024-04-23 22:37:42 +08:00
commit 65bf69d104
6 changed files with 421 additions and 572 deletions

View file

@ -371,16 +371,16 @@ struct ggml_gallocr {
};
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
GGML_ASSERT(galloc != NULL);
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
GGML_ASSERT(galloc->bufts != NULL);
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
GGML_ASSERT(galloc->buffers != NULL);
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
for (int i = 0; i < n_bufs; i++) {
@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
free(galloc->hash_set.keys);
free(galloc->hash_values);
galloc->hash_set.size = hash_size;
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
GGML_ASSERT(galloc->hash_set.keys != NULL);
GGML_ASSERT(galloc->hash_values != NULL);
} else {
@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
// set the node_allocs from the hash table
if (galloc->n_nodes < graph->n_nodes) {
free(galloc->node_allocs);
galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
GGML_ASSERT(galloc->node_allocs != NULL);
}
galloc->n_nodes = graph->n_nodes;
@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
if (galloc->n_leafs < graph->n_leafs) {
free(galloc->leaf_allocs);
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
GGML_ASSERT(galloc->leaf_allocs != NULL);
}
galloc->n_leafs = graph->n_leafs;

View file

@ -1725,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
// initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
const int initial_splits_capacity = 16;
sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
sched->splits_capacity = initial_splits_capacity;
for (int b = 0; b < n_backends; b++) {
@ -1972,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_hash_set hash_set = {
/* .size = */ graph->visited_hash_table.size,
/* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
};
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
struct ggml_init_params params = {
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),

8
ggml.c
View file

@ -10826,7 +10826,7 @@ static void ggml_compute_forward_mul_mat(
#endif
#if GGML_USE_LLAMAFILE
if (nb10 == ggml_type_size(src1->type)) {
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@ -10879,15 +10879,13 @@ UseGgmlGemm1:;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
#if GGML_USE_LLAMAFILE
if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
if (src1->type != vec_dot_type) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)wdata + ggml_row_size(vec_dot_type,
nb12/ggml_type_size(src1->type)*i12 +
nb13/ggml_type_size(src1->type)*i13),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),

View file

@ -633,7 +633,7 @@ maxhordelen = 256
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
KcppVersion = "1.63"
KcppVersion = "1.64"
showdebug = True
showsamplerwarning = True
showmaxctxwarning = True

View file

@ -4403,7 +4403,7 @@ static void llm_load_vocab(
}
}
// find EOT token: "<|eot_id|>", "<|im_emd|>", "<end_of_turn>", etc.
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
//
// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
// for now, we apply this workaround to find the EOT token based on its text
@ -4414,7 +4414,7 @@ static void llm_load_vocab(
// need to fix convert script
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
(t.first == "<|eot_id|>" ||
t.first == "<|im_emd|>" ||
t.first == "<|im_end|>" ||
t.first == "<end_of_turn>"
)
) {

945
sgemm.cpp

File diff suppressed because it is too large Load diff