diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fbd49d13..a415745f 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1963,7 +1963,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         ggml_backend_sched_print_assignments(sched, graph);
     }
 
-    // swap node_backend_ids and leaf _backend_ids with prevs
+    // swap node_backend_ids and leaf_backend_ids with prevs
     {
         int * tmp = sched->node_backend_ids;
         sched->node_backend_ids = sched->prev_node_backend_ids;
@@ -2205,14 +2205,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     // initialize hash table
    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
-    sched->hash_set              = ggml_hash_set_new(graph_size);
+    sched->hash_set = ggml_hash_set_new(graph_size);
     sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
     sched->hv_tensor_copies      = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
 
     const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
-    const size_t nodes_size  = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids  = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
-    sched->leaf_backend_ids  = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
     sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
     sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
@@ -2400,7 +2400,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
                 (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
 
     tensor->buffer = buffer;
-    tensor->data   = addr;
+    tensor->data = addr;
 
     ggml_backend_buffer_init_tensor(buffer, tensor);
 }
diff --git a/src/llama.cpp b/src/llama.cpp
index fa6669af..1552cc9a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -88,6 +88,7 @@
 #include <thread>
 #include <type_traits>
 #include <unordered_map>
+#include <regex>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -4478,6 +4479,18 @@ static size_t llama_model_max_nodes(const llama_model & model) {
     return std::max<size_t>(8192, model.tensors_by_name.size()*5);
 }
 
+static int get_layer_id(const ggml_tensor * tensor) {
+    std::string name(ggml_get_name(tensor));
+    std::regex layer_id_regex(R"(\.([0-9]+)\.)");
+    std::smatch match;
+
+    if (std::regex_search(name, match, layer_id_regex)) {
+        return std::stoi(match[1].str());
+    } else {
+        return -1;
+    }
+}
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
@@ -5085,13 +5098,9 @@
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
             try {
-                const auto * weight = get_weight(ggml_get_name(tensor));
-                if (!weight) {
-                    continue;
-                }
-                if (weight->idx != idx) {
-                    continue;
-                }
+                const llama_tensor_weight * weight = get_weight(ggml_get_name(tensor));
+                if (!weight || weight->idx != idx) continue;
+
                 *first = std::min(*first, weight->offs);
                 *last  = std::max(*last, weight->offs + ggml_nbytes(tensor));
             } catch(...) {
@@ -5100,6 +5109,42 @@ struct llama_model_loader {
         }
     }
 
+    void get_mapping_ranges(std::vector<std::pair<size_t, size_t>>& buffer_ranges, void ** addr, int idx, ggml_context * ctx) const {
+        GGML_ASSERT(!mappings.empty());
+        const auto & mapping = mappings.at(idx);
+        *addr = mapping->addr;
+
+        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+            try {
+                const llama_tensor_weight * weight = get_weight(ggml_get_name(tensor));
+                if (!weight || weight->idx != idx) continue;
+
+                size_t tensor_first = weight->offs;
+                size_t tensor_last = tensor_first + ggml_nbytes(tensor);
+
+                auto it = std::lower_bound(
+                    buffer_ranges.begin(), buffer_ranges.end(), std::make_pair(tensor_first, tensor_last),
+                    [](const std::pair<size_t, size_t>& a, const std::pair<size_t, size_t>& b) {
+                        return a.first < b.first;
+                    }
+                );
+
+                if (it != buffer_ranges.begin() && (it - 1)->second >= tensor_first) {
+                    --it;
+                    it->second = std::max(it->second, tensor_last);
+                } else {
+                    it = buffer_ranges.insert(it, {tensor_first, tensor_last});
+                }
+
+                while (it + 1 != buffer_ranges.end() && (it + 1)->first <= it->second) {
+                    it->second = std::max(it->second, (it + 1)->second);
+                    buffer_ranges.erase(it + 1);
+                }
+            } catch (...) {
+            }
+        }
+    }
+
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
         const auto & w = require_weight(ggml_get_name(cur));
@@ -10322,7 +10367,7 @@ struct llm_build_context {
             const llm_build_cb & cb) {
         lctx.backend_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hparams.n_embd, batch.n_tokens);
         cb(lctx.backend_embd, "backend_embd", -1);
-        ggml_set_input(lctx.backend_embd);
+        // ggml_set_input(lctx.backend_embd); // set it on the device of the adjacent node
 
         return lctx.backend_embd;
     }
@@ -10333,7 +10378,7 @@ struct llm_build_context {
             const llm_build_cb & cb) {
         lctx.out_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hparams.n_embd, n_outputs);
         cb(lctx.out_embd, "out_embd", -1);
-        ggml_set_input(lctx.out_embd);
+        // ggml_set_input(lctx.out_embd); // set it on the device of the adjacent node
 
         return lctx.out_embd;
     }
@@ -20144,9 +20189,9 @@ struct llama_context * llama_new_context_with_model(
         }
 
         // build worst-case graph
-        uint32_t n_seqs    = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens  = std::min(cparams.n_ctx, cparams.n_ubatch);
-        llama_token token  = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+        llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
         llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
         std::vector<ggml_cgraph *> gf = llama_build_graph(*ctx, ubatch, true);
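
Aside: a self-contained sketch (not part of the patch) of what the new get_layer_id() helper yields. The std::string overload and the sample tensor names are illustrative assumptions for the demo, not code from this PR.

    // Standalone demo of the layer-id extraction used by get_layer_id() above:
    // the first ".<digits>." group in the tensor name is taken as the layer index.
    #include <cstdio>
    #include <regex>
    #include <string>

    static int get_layer_id_demo(const std::string & name) {
        std::regex layer_id_regex(R"(\.([0-9]+)\.)"); // same pattern as the patch
        std::smatch match;
        if (std::regex_search(name, match, layer_id_regex)) {
            return std::stoi(match[1].str());
        }
        return -1; // tensors without a per-layer index, e.g. "token_embd.weight"
    }

    int main() {
        printf("%d\n", get_layer_id_demo("blk.17.attn_q.weight")); // prints 17
        printf("%d\n", get_layer_id_demo("token_embd.weight"));    // prints -1
        return 0;
    }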
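The most intricate part of the patch is the range coalescing in get_mapping_ranges(): buffer_ranges is kept sorted by start offset and non-overlapping, and each tensor's [offs, offs + nbytes) span is merged into the preceding range when they touch, inserted otherwise, and then used to absorb any following ranges it now covers. Below is a minimal standalone sketch of that strategy, with the merge logic pulled out into a hypothetical insert_range() helper (my name, not the patch's):

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    using range = std::pair<size_t, size_t>; // [first, last) byte offsets

    // Insert [first, last) into a sorted, non-overlapping list of ranges,
    // coalescing it with any neighbors it touches (same logic as the patch).
    static void insert_range(std::vector<range> & ranges, size_t first, size_t last) {
        auto it = std::lower_bound(
            ranges.begin(), ranges.end(), range{first, last},
            [](const range & a, const range & b) { return a.first < b.first; });

        if (it != ranges.begin() && (it - 1)->second >= first) {
            --it;                                  // overlaps/abuts predecessor: extend it
            it->second = std::max(it->second, last);
        } else {
            it = ranges.insert(it, {first, last}); // standalone range so far
        }

        while (it + 1 != ranges.end() && (it + 1)->first <= it->second) {
            it->second = std::max(it->second, (it + 1)->second); // absorb successor
            ranges.erase(it + 1);                                // now covered
        }
    }

    int main() {
        std::vector<range> ranges;
        insert_range(ranges, 100, 200);
        insert_range(ranges, 300, 400);
        insert_range(ranges, 150, 350); // bridges the two ranges above

        for (const auto & r : ranges) {
            printf("[%zu, %zu)\n", r.first, r.second); // prints a single [100, 400)
        }
        return 0;
    }

Where get_mapping_range() tracks only a single [first, last] pair per mapping, this variant presumably yields one entry per contiguous region actually touched, so a sparse mapping is not over-extended by the gaps between unrelated tensors.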