diff --git a/otherarch/ggml_v3-cuda.cu b/otherarch/ggml_v3-cuda.cu index 8a682683d..a1a789b10 100644 --- a/otherarch/ggml_v3-cuda.cu +++ b/otherarch/ggml_v3-cuda.cu @@ -7254,7 +7254,7 @@ static void im2col_f32_f16_cuda(const float* x, half* dst, } // buffer pool for cuda -#define MAX_CUDA_BUFFERS_V3 512 +#define MAX_CUDA_BUFFERS_V3 256 struct scoped_spin_lock { std::atomic_flag& lock; diff --git a/otherarch/ggml_v3.c b/otherarch/ggml_v3.c index fb600a8c8..70213835e 100644 --- a/otherarch/ggml_v3.c +++ b/otherarch/ggml_v3.c @@ -241,6 +241,643 @@ size_t ggml_v3_hash_insert ( struct ggml_v3_hash_set hash_set, struc // return index, asserts if table is full size_t ggml_v3_hash_find_or_insert( struct ggml_v3_hash_set hash_set, struct ggml_v3_tensor * key); +//allocator stuff +#include +#include +#include +#include +#include + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #include +#endif +#define UNUSED(x) (void)(x) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define GGML_V3_MAX_CONCUR (2*GGML_V3_MAX_NODES) +//#define GGML_V3_ALLOCATOR_DEBUG +//#define AT_PRINTF printf +#define AT_PRINTF(...) ((void)0) +struct hash_node_v3 { + struct ggml_v3_tensor * t; + int n_children; + int n_views; +}; +static size_t hash(void * p) { + return (size_t)p % GGML_V3_GRAPH_HASHTABLE_SIZE; +} + +static struct hash_node_v3 * hash_get(struct hash_node_v3 hash_table[], struct ggml_v3_tensor * t) { + size_t h = hash(t); + + // linear probing + size_t i = h; + while (hash_table[i].t != NULL) { + if (hash_table[i].t == t) { + return &hash_table[i]; + } + i = (i + 1) % GGML_V3_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_V3_ASSERT(false); + } + } + + hash_table[i].t = t; + return &hash_table[i]; +} + +// TODO: GGML_V3_PAD ? +static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { + assert(alignment && !(alignment & (alignment - 1))); // power of 2 + size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; + return offset + align; +} + +struct free_block_v3 { + void * addr; + size_t size; +}; + +#define MAX_FREE_BLOCKS 256 + +struct ggml_v3_allocr { + void * data; + size_t size; + size_t alignment; + int n_free_block_v3s; + struct free_block_v3 free_block_v3s[MAX_FREE_BLOCKS]; + struct hash_node_v3 hash_table[GGML_V3_GRAPH_HASHTABLE_SIZE]; + size_t max_size; + bool measure; + int parse_seq[GGML_V3_MAX_CONCUR]; + int parse_seq_len; + +#ifdef GGML_V3_ALLOCATOR_DEBUG + struct ggml_v3_tensor * allocated_tensors[1024]; +#endif +}; + +#ifdef GGML_V3_ALLOCATOR_DEBUG +static void add_allocated_tensor(struct ggml_v3_allocr * alloc, struct ggml_v3_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == NULL) { + alloc->allocated_tensors[i] = tensor; + return; + } + } + GGML_V3_ASSERT(!"out of allocated_tensors"); +} +static void remove_allocated_tensor(struct ggml_v3_allocr * alloc, struct ggml_v3_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == tensor || + (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { + alloc->allocated_tensors[i] = NULL; + return; + } + } + printf("tried to free tensor %s not found\n", tensor->name); + GGML_V3_ASSERT(!"tensor not found"); +} +#endif + +static size_t ggml_v3_allocr_get_alloc_size(struct ggml_v3_allocr * alloc, struct ggml_v3_tensor * tensor) { + return ggml_v3_nbytes(tensor); + + UNUSED(alloc); +} + +// check if a tensor is allocated by this buffer +static bool ggml_v3_allocr_is_own(struct ggml_v3_allocr * alloc, const struct ggml_v3_tensor * tensor) { + void * ptr = tensor->data; + return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size; +} + +static bool ggml_v3_is_view(struct ggml_v3_tensor * t) { + return t->view_src != NULL; +} + +void ggml_v3_allocr_alloc(struct ggml_v3_allocr * alloc, struct ggml_v3_tensor * tensor) { +#ifdef GGML_V3_ALLOCATOR_DEBUG + GGML_V3_ASSERT(!ggml_v3_is_view(tensor)); // views generally get data pointer from one of their sources + GGML_V3_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated +#endif + size_t size = ggml_v3_allocr_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + + AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); + + size_t max_avail = 0; + + // find the best fitting free block besides the last block + int best_fit_block = -1; + size_t best_fit_size = SIZE_MAX; + for (int i = 0; i < alloc->n_free_block_v3s - 1; i++) { + struct free_block_v3 * block = &alloc->free_block_v3s[i]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size && block->size <= best_fit_size) { + best_fit_block = i; + best_fit_size = block->size; + } + } + + AT_PRINTF("block %d\n", best_fit_block); + + if (best_fit_block == -1) { + // the last block is our last resort + struct free_block_v3 * block = &alloc->free_block_v3s[alloc->n_free_block_v3s - 1]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size) { + best_fit_block = alloc->n_free_block_v3s - 1; + } else { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + __func__, size, max_avail); + GGML_V3_ASSERT(!"not enough space in the buffer"); + return; + } + } + struct free_block_v3 * block = &alloc->free_block_v3s[best_fit_block]; + void * addr = block->addr; + block->addr = (char*)block->addr + size; + block->size -= size; + if (block->size == 0) { + // remove block if empty + alloc->n_free_block_v3s--; + for (int j = best_fit_block; j < alloc->n_free_block_v3s; j++) { + alloc->free_block_v3s[j] = alloc->free_block_v3s[j+1]; + } + } + + tensor->data = addr; + AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); + +#ifdef GGML_V3_ALLOCATOR_DEBUG + add_allocated_tensor(alloc, tensor); + size_t cur_max = (char*)addr - (char*)alloc->data + size; + if (cur_max > alloc->max_size) { + printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i]) { + printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_v3_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + } + } + printf("\n"); + } +#endif + + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); +} + +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_v3_allocr_free_tensor(struct ggml_v3_allocr * alloc, struct ggml_v3_tensor * tensor) { + void * ptr = tensor->data; + + if (ggml_v3_allocr_is_own(alloc, tensor) == false) { + // the tensor was not allocated in this buffer + // this can happen because the graph allocator will try to free weights and other tensors from different buffers + // the easiest way to deal with this is just to ignore it + return; + } + + size_t size = ggml_v3_allocr_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_block_v3s = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_block_v3s); + AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); + +#ifdef GGML_V3_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, tensor); +#endif + + // see if we can merge with an existing block + for (int i = 0; i < alloc->n_free_block_v3s; i++) { + struct free_block_v3 * block = &alloc->free_block_v3s[i]; + // check if ptr is at the end of the block + if ((char*)block->addr + block->size == ptr) { + block->size += size; + // check if we can merge with the next block + if (i < alloc->n_free_block_v3s - 1 && (char*)block->addr + block->size == alloc->free_block_v3s[i+1].addr) { + block->size += alloc->free_block_v3s[i+1].size; + alloc->n_free_block_v3s--; + for (int j = i+1; j < alloc->n_free_block_v3s; j++) { + alloc->free_block_v3s[j] = alloc->free_block_v3s[j+1]; + } + } + return; + } + // check if ptr is at the beginning of the block + if ((char*)ptr + size == block->addr) { + block->addr = ptr; + block->size += size; + // check if we can merge with the previous block + if (i > 0 && (char*)alloc->free_block_v3s[i-1].addr + alloc->free_block_v3s[i-1].size == block->addr) { + alloc->free_block_v3s[i-1].size += block->size; + alloc->n_free_block_v3s--; + for (int j = i; j < alloc->n_free_block_v3s; j++) { + alloc->free_block_v3s[j] = alloc->free_block_v3s[j+1]; + } + } + return; + } + } + // otherwise, add a new block + GGML_V3_ASSERT(alloc->n_free_block_v3s < MAX_FREE_BLOCKS && "out of free blocks"); + // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) + int insert_pos = 0; + while (insert_pos < alloc->n_free_block_v3s && alloc->free_block_v3s[insert_pos].addr < ptr) { + insert_pos++; + } + // shift all blocks from insert_pos onward to make room for the new block + for (int i = alloc->n_free_block_v3s; i > insert_pos; i--) { + alloc->free_block_v3s[i] = alloc->free_block_v3s[i-1]; + } + // insert the new block + alloc->free_block_v3s[insert_pos].addr = ptr; + alloc->free_block_v3s[insert_pos].size = size; + alloc->n_free_block_v3s++; +} + +void ggml_v3_allocr_set_parse_seq(struct ggml_v3_allocr * alloc, const int * list, int n) { + for (int i = 0; i < n; i++) { + alloc->parse_seq[i] = list[i]; + } + alloc->parse_seq_len = n; +} + +void ggml_v3_allocr_reset(struct ggml_v3_allocr * alloc) { + alloc->n_free_block_v3s = 1; + size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); + alloc->free_block_v3s[0].addr = (char *)alloc->data + align_offset; + alloc->free_block_v3s[0].size = alloc->size - align_offset; +} + +struct ggml_v3_allocr * ggml_v3_allocr_new(void * data, size_t size, size_t alignment) { + struct ggml_v3_allocr * alloc = (struct ggml_v3_allocr *)malloc(sizeof(struct ggml_v3_allocr) /* + n_free_block_v3s * sizeof(struct free_block_v3) */); + + *alloc = (struct ggml_v3_allocr){ + /*.data = */ data, + /*.size = */ size, + /*.alignment = */ alignment, + /*.n_free_block_v3s = */ 0, + /*.free_block_v3s = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ false, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, +#ifdef GGML_V3_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_v3_allocr_reset(alloc); + + return alloc; +} + +// OS specific functions to allocate and free uncommitted virtual memory +static void * alloc_vmem(size_t size) { +#if defined(_WIN32) + return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); +#elif defined(_POSIX_MAPPED_FILES) + void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) { + return NULL; + } + return ptr; +#else + // use a fixed address for other platforms + uintptr_t base_addr = (uintptr_t)-size - 0x100; + return (void *)base_addr; +#endif +} + +static void free_vmem(void * base_addr, size_t size) { +#if defined(_WIN32) + VirtualFree(base_addr, 0, MEM_RELEASE); + UNUSED(size); +#elif defined(_POSIX_MAPPED_FILES) + munmap(base_addr, size); +#else + // nothing to do + UNUSED(base_addr); + UNUSED(size); +#endif +} + +// allocate uncommitted virtual memory to measure the size of the graph +static void alloc_measure_vmem(void ** base_addr, size_t * size) { + // 128GB for 64-bit, 1GB for 32-bit + *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37; + do { + *base_addr = alloc_vmem(*size); + if (*base_addr != NULL) { + AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr); + return; + } + // try again with half the size + *size /= 2; + } while (*size > 0); + + GGML_V3_ASSERT(!"failed to allocate virtual memory for measure buffer"); +} + +static void free_measure_vmem(void * base_addr, size_t size) { + free_vmem(base_addr, size); +} + +struct ggml_v3_allocr * ggml_v3_allocr_new_measure(size_t alignment) { + struct ggml_v3_allocr * alloc = (struct ggml_v3_allocr *)malloc(sizeof(struct ggml_v3_allocr) /* + n_free_block_v3s * sizeof(struct free_block_v3) */); + + void * base_addr; + size_t size; + + alloc_measure_vmem(&base_addr, &size); + + *alloc = (struct ggml_v3_allocr){ + /*.data = */ base_addr, + /*.size = */ size, + /*.alignment = */ alignment, + /*.n_free_block_v3s = */ 0, + /*.free_block_v3s = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ true, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, +#ifdef GGML_V3_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_v3_allocr_reset(alloc); + + return alloc; +} + +void ggml_v3_allocr_free(struct ggml_v3_allocr * alloc) { + if (alloc->measure) { + free_measure_vmem(alloc->data, alloc->size); + } + free(alloc); +} + +bool ggml_v3_allocr_is_measure(struct ggml_v3_allocr * alloc) { + return alloc->measure; +} + +//////////// compute graph allocator + +static bool ggml_v3_are_same_layout(const struct ggml_v3_tensor * a, const struct ggml_v3_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_V3_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static bool ggml_v3_op_can_inplace(enum ggml_v3_op op) { + switch (op) { + case GGML_V3_OP_SCALE: + case GGML_V3_OP_DIAG_MASK_ZERO: + case GGML_V3_OP_DIAG_MASK_INF: + case GGML_V3_OP_ADD: + case GGML_V3_OP_ADD1: + case GGML_V3_OP_SUB: + case GGML_V3_OP_MUL: + case GGML_V3_OP_DIV: + case GGML_V3_OP_SQR: + case GGML_V3_OP_SQRT: + case GGML_V3_OP_LOG: + case GGML_V3_OP_UNARY: + case GGML_V3_OP_ROPE: + case GGML_V3_OP_RMS_NORM: + case GGML_V3_OP_SOFT_MAX: + case GGML_V3_OP_CONT: + return true; + + default: + return false; + } +} + +static void allocate_node_v3(struct ggml_v3_allocr * alloc, struct ggml_v3_tensor * node) { + struct hash_node_v3 * ht = alloc->hash_table; + if (node->data == NULL) { + if (ggml_v3_is_view(node)) { + assert(node->view_src->data != NULL); + node->data = (char *)node->view_src->data + node->view_offs; + } else { + // see if we can reuse a parent's buffer (inplace) + if (ggml_v3_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_V3_MAX_SRC; i++) { + struct ggml_v3_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } + + // if the node's data is external, then we cannot re-use it + if (ggml_v3_allocr_is_own(alloc, parent) == false) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } + + struct hash_node_v3 * p_hn = hash_get(ht, parent); + if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_v3_are_same_layout(node, parent)) { + if (ggml_v3_is_view(parent)) { + struct ggml_v3_tensor * view_src = parent->view_src; + struct hash_node_v3 * view_src_hn = hash_get(ht, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite + // the parent's data that it will need later (same layout requirement). the problem is that then + // we cannot free the tensor because the original address of the allocation is lost. + // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views + // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + node->data = parent->data; + return; + } + } + else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + node->data = parent->data; + return; + } + } + } + } + ggml_v3_allocr_alloc(alloc, node); + } + } +} + +static size_t ggml_v3_allocr_alloc_graph_tensors_n( + struct ggml_v3_allocr * alloc, + struct ggml_v3_cgraph ** graphs, int n_graphs, + struct ggml_v3_tensor *** inputs, struct ggml_v3_tensor *** outputs) { + + // reset hash table + struct hash_node_v3 * ht = alloc->hash_table; + memset(ht, 0, sizeof(struct hash_node_v3) * GGML_V3_GRAPH_HASHTABLE_SIZE); + + // count number of children and views + for (int g = 0; g < n_graphs; g++) { + struct ggml_v3_cgraph * gf = graphs[g]; + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_v3_tensor * node = gf->nodes[i]; + + if (ggml_v3_is_view(node)) { + struct ggml_v3_tensor * view_src = node->view_src; + hash_get(ht, view_src)->n_views += 1; + } + + for (int j = 0; j < GGML_V3_MAX_SRC; j++) { + struct ggml_v3_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + hash_get(ht, parent)->n_children += 1; + } + } + } + + // allocate tensors + for (int g = 0; g < n_graphs; g++) { + struct ggml_v3_cgraph * gf = graphs[g]; + AT_PRINTF("####### graph %d/%d\n", g, n_graphs); + // graph inputs are allocated first to ensure that they are not overwritten by each other + if (inputs != NULL && inputs[g] != NULL) { + for (int i = 0; inputs[g][i] != NULL; i++) { + struct ggml_v3_tensor * input = inputs[g][i]; + AT_PRINTF("input: %s\n", input->name); + allocate_node_v3(alloc, input); + } + } + // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers + int last_barrier_pos = 0; + int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes; + + for (int ind = 0; ind < n_nodes; ind++) { + // allocate a node if there is no parse_seq or this is not a barrier + if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) { + int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind; + struct ggml_v3_tensor * node = gf->nodes[i]; + + // allocate parents (leafs) + for (int j = 0; j < GGML_V3_MAX_SRC; j++) { + struct ggml_v3_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + allocate_node_v3(alloc, parent); + } + + // allocate node + allocate_node_v3(alloc, node); + + AT_PRINTF("exec: %s (%s) <= ", ggml_v3_op_name(node->op), node->name); + for (int j = 0; j < GGML_V3_MAX_SRC; j++) { + struct ggml_v3_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_V3_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } + } + AT_PRINTF("\n"); + } + + // update parents + // update immediately if there is no parse_seq + // update only at barriers if there is parse_seq + if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) { + int update_start = alloc->parse_seq_len ? last_barrier_pos : ind; + int update_end = alloc->parse_seq_len ? ind : ind + 1; + for (int i = update_start; i < update_end; i++) { + int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i; + struct ggml_v3_tensor * node = gf->nodes[node_i]; + + for (int j = 0; j < GGML_V3_MAX_SRC; j++) { + struct ggml_v3_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + struct hash_node_v3 * p_hn = hash_get(ht, parent); + p_hn->n_children -= 1; + + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_v3_is_view(parent)) { + struct ggml_v3_tensor * view_src = parent->view_src; + struct hash_node_v3 * view_src_hn = hash_get(ht, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { + ggml_v3_allocr_free_tensor(alloc, view_src); + } + } + else { + if (parent->data != node->data) { + ggml_v3_allocr_free_tensor(alloc, parent); + } + } + } + } + } + AT_PRINTF("\n"); + if (alloc->parse_seq_len) { + last_barrier_pos = ind + 1; + } + } + } + // free graph outputs here that wouldn't be freed otherwise because they have no children + if (outputs != NULL && outputs[g] != NULL) { + for (int i = 0; outputs[g][i] != NULL; i++) { + struct ggml_v3_tensor * output = outputs[g][i]; + AT_PRINTF("output: %s\n", output->name); + ggml_v3_allocr_free_tensor(alloc, output); + } + } + } + + return alloc->max_size; +} + +size_t ggml_v3_allocr_alloc_graph(struct ggml_v3_allocr * alloc, struct ggml_v3_cgraph * graph) { + return ggml_v3_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); +} + +size_t ggml_v3_allocr_max_size(struct ggml_v3_allocr * alloc) { + return alloc->max_size; +} + + + + + #ifdef __cplusplus } #endif diff --git a/otherarch/ggml_v3.h b/otherarch/ggml_v3.h index e8748e029..cd8ed48b1 100644 --- a/otherarch/ggml_v3.h +++ b/otherarch/ggml_v3.h @@ -2256,6 +2256,23 @@ extern "C" { GGML_V3_API ggml_v3_type_traits_t ggml_v3_internal_get_type_traits(enum ggml_v3_type type); +//allocator stuff + +GGML_V3_API struct ggml_v3_allocr * ggml_v3_allocr_new(void * data, size_t size, size_t alignment); +GGML_V3_API struct ggml_v3_allocr * ggml_v3_allocr_new_measure(size_t alignment); +// tell the allocator to parse nodes following the order described in the list +// you should call this if your graph are optimized to execute out-of-order +GGML_V3_API void ggml_v3_allocr_set_parse_seq(struct ggml_v3_allocr * alloc, const int * list, int n); +GGML_V3_API void ggml_v3_allocr_free(struct ggml_v3_allocr * alloc); +GGML_V3_API bool ggml_v3_allocr_is_measure(struct ggml_v3_allocr * alloc); +GGML_V3_API void ggml_v3_allocr_reset(struct ggml_v3_allocr * alloc); +GGML_V3_API void ggml_v3_allocr_alloc(struct ggml_v3_allocr * alloc, struct ggml_v3_tensor * tensor); +GGML_V3_API size_t ggml_v3_allocr_alloc_graph(struct ggml_v3_allocr * alloc, struct ggml_v3_cgraph * graph); +GGML_V3_API size_t ggml_v3_allocr_max_size(struct ggml_v3_allocr * alloc); + +#define GGML_V3_GRAPH_HASHTABLE_SIZE 32771 +#define GGML_V3_MAX_NODES 8192 + #ifdef __cplusplus } #endif diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp index 6652e67fb..6bae1df99 100644 --- a/otherarch/llama_v3.cpp +++ b/otherarch/llama_v3.cpp @@ -61,14 +61,12 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char * #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__) -// disable allocator for backwards compatibility - to avoid gguf changes messing it up -// #include "ggml-alloc.h" -// #if !defined(GGML_USE_CUBLAS) -// #define LLAMA_V3_USE_ALLOCATOR -// #else +#if !defined(GGML_USE_CUBLAS) +#define LLAMA_V3_USE_ALLOCATOR +#else #define LLAMA_V3_USE_SCRATCH #define LLAMA_V3_MAX_SCRATCH_BUFFERS 16 -// #endif +#endif // available llama models diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index 5c7deb86d..cc628e1bd 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -459,4 +459,3 @@ struct mpt_model { }; const float default_norm_eps = 1e-5f; -const size_t GGML_V3_MAX_NODES = 8192; \ No newline at end of file