diff --git a/src/llama.cpp b/src/llama.cpp
index 39d1cf5b..2f63c1f6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16419,7 +16419,6 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
     const uint32_t n_world = lctx.cparams.n_world;
     const uint32_t my_rank = lctx.cparams.rank;
     const uint32_t * n_layer_window = lctx.cparams.n_layer_window;
-    const uint32_t n_layer = lctx.model.hparams.n_layer;

     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
@@ -17444,6 +17443,10 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
 }

 static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force = false) {
+    size_t first = SIZE_MAX;
+    size_t last = 0;
+    long page_size = sysconf(_SC_PAGESIZE);
+
     for (int i = 0; i < ggml_graph_n_leafs(cgraph); i++) {
         struct ggml_tensor * cur = ggml_graph_leaf(cgraph, i);

@@ -17451,26 +17454,25 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
             continue;
         }

-        void * addr = (void *)cur->data;
-        size_t size = ggml_nbytes(cur);
-        size_t first = (size_t)addr;
-        size_t last = first + size;
-        long page_size = sysconf(_SC_PAGESIZE);
+        size_t addr = reinterpret_cast<size_t>(cur->data);
+        first = std::min(first, addr);
+        last = std::max(last, addr + ggml_nbytes(cur));
+    }

-        // align addr
-        llama_mmap::align_range(&first, &last, page_size);
-        size_t len = std::max(last - first, static_cast<size_t>(page_size));
+    // align addr
+    llama_mmap::align_range(&first, &last, page_size);
+    size_t len = std::max(last - first, static_cast<size_t>(page_size));

-        // hint to load memory
-        posix_madvise((void *)first, len, advice);
+    // hint to load memory
+    posix_madvise(reinterpret_cast<void *>(first), len, advice);

-        // if advice is POSIX_MADV_WILLNEED, force to prefetch data
-        if (force && advice == POSIX_MADV_WILLNEED) {
-            volatile char * ptr = (volatile char *)first;
-            for (size_t off = 0; off < len; off += page_size) {
-                volatile char data = ptr[off];
-                (void)data;
-            }
+    // if advice is POSIX_MADV_WILLNEED, force to prefetch data
+    if (force && advice == POSIX_MADV_WILLNEED) {
+        // coarse-grained prefetch
+        char * ptr = reinterpret_cast<char *>(first);
+        for (size_t off = 0; off < len; off += page_size * 32) {
+            volatile char data = ptr[off];
+            (void)data;
         }
     }
 }
@@ -17674,7 +17676,6 @@ static int llama_decode_internal(
     }

     ggml_cgraph * sub_gf = nullptr;
-    ggml_cgraph * next_gf = nullptr;
     const uint32_t n_layer = hparams.n_layer;
     const char * layer_str = nullptr;
     int cur_l = -1;
@@ -17687,26 +17688,24 @@ static int llama_decode_internal(
         next_gf = gf[(i + 1) % gf.size()];

         if (n_world > 1 && !(my_rank == 0 && i == 0) && !(my_rank == 0 && is_last_l)) {
-            { // receive data from previous nodes
-                timer(llama_recv_tensors);
-                input_tensors tensors;
-                const bool is_out_embd = my_rank == 0 && i == (size_t)gf.size() - 1;
-                tensors.sub_gf_out = is_out_embd ? lctx.out_embd : lctx.backend_embd;
-                tensors.inp_pos = lctx.inp_pos;
-                llama_recv_tensors(*lctx.recv_socket, &tensors);
-
-                is_last_l = my_rank == 0 && i == (size_t)gf.size() - 1;
-                size_t buff_size = tensors.sub_gf_out->ne[0] * tensors.sub_gf_out->ne[1] * ggml_element_size(tensors.sub_gf_out);
-                if (!is_last_l) {
-                    memcpy(ubatch.backend_embd, tensors.sub_gf_out->data, buff_size);
-                } else {
-                    memcpy(ubatch.out_embd, tensors.sub_gf_out->data, buff_size);
-                }
-                if (my_rank != 0 && i == 0) {
-                    buff_size = tensors.inp_pos->ne[0] * ggml_element_size(tensors.inp_pos);
-                    memcpy(ubatch.pos, tensors.inp_pos->data, buff_size);
-                }
+            // receive data from previous nodes
+            input_tensors tensors;
+            const bool is_out_embd = my_rank == 0 && i == (size_t)gf.size() - 1;
+            tensors.sub_gf_out = is_out_embd ? lctx.out_embd : lctx.backend_embd;
+            tensors.inp_pos = lctx.inp_pos;
+            llama_recv_tensors(*lctx.recv_socket, &tensors);
+
+            is_last_l = my_rank == 0 && i == (size_t)gf.size() - 1;
+            size_t buff_size = tensors.sub_gf_out->ne[0] * tensors.sub_gf_out->ne[1] * ggml_element_size(tensors.sub_gf_out);
+            if (!is_last_l) {
+                memcpy(ubatch.backend_embd, tensors.sub_gf_out->data, buff_size);
+            } else {
+                memcpy(ubatch.out_embd, tensors.sub_gf_out->data, buff_size);
             }
+            if (my_rank != 0 && i == 0) {
+                buff_size = tensors.inp_pos->ne[0] * ggml_element_size(tensors.inp_pos);
+                memcpy(ubatch.pos, tensors.inp_pos->data, buff_size);
+            }
         }

         llama_set_inputs(lctx, ubatch);
@@ -17749,8 +17748,11 @@ static int llama_decode_internal(
             timer(manage_graph_tensors);
             if (n_world != 1) {
                 manage_graph_tensors(sub_gf, POSIX_MADV_DONTNEED);
-                if (!(my_rank == 0 && is_last_l)) {
-                    manage_graph_tensors(next_gf, POSIX_MADV_WILLNEED, true);
+
+                int next_gf_id = (i + 1) % gf.size();
+                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, true);
+                if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
+                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, true);
                 }
             }
         }
@@ -20140,7 +20142,7 @@ struct llama_context * llama_new_context_with_model(
         }
     }

-    for (int i = 0; i < 10; ++i) {
+    for (int i = 0; i < 20; ++i) {
         ctx->sched.push_back(ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel));
     }

@@ -20151,6 +20153,7 @@ struct llama_context * llama_new_context_with_model(

         llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
         std::vector<struct ggml_cgraph *> gf = llama_build_graph(*ctx, ubatch, true);
+        GGML_ASSERT(gf.size() <= 20 && "Number of subgraphs exceeds the maximum number of schedulers\n");
         ctx->sched.resize(gf.size());

         // prefetch the first subgraph weights
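
Note (reviewer sketch, not part of the patch): the reworked manage_graph_tensors now coalesces all leaf tensors into a single [first, last) address range, page-aligns it, issues one posix_madvise for the whole range, and, when a forced prefetch is requested, touches one byte every 32 pages. A minimal standalone C++ sketch of that pattern follows; the region struct, the manage_regions name, and the open-coded page alignment (the patch calls llama_mmap::align_range) are illustrative assumptions rather than llama.cpp APIs.

```cpp
// Standalone sketch of the coalesce-then-advise pattern used above (illustrative only).
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>
#include <sys/mman.h>
#include <unistd.h>

struct region { void * data; size_t size; };   // stand-in for one mmap'd tensor

static void manage_regions(const std::vector<region> & regions, int advice, bool force = false) {
    size_t first = SIZE_MAX;
    size_t last  = 0;
    const size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));

    // single pass: smallest range covering every region
    for (const region & r : regions) {
        const size_t addr = reinterpret_cast<size_t>(r.data);
        first = std::min(first, addr);
        last  = std::max(last,  addr + r.size);
    }
    if (first >= last) {
        return; // no regions
    }

    // page-align the range (the patch delegates this to llama_mmap::align_range)
    first = first & ~(page_size - 1);
    last  = (last + page_size - 1) & ~(page_size - 1);
    const size_t len = std::max(last - first, page_size);

    // one hint for the whole merged range instead of one syscall per tensor
    posix_madvise(reinterpret_cast<void *>(first), len, advice);

    // optionally touch one byte every 32 pages to force the data in now
    if (force && advice == POSIX_MADV_WILLNEED) {
        char * ptr = reinterpret_cast<char *>(first);
        for (size_t off = 0; off < len; off += page_size * 32) {
            volatile char probe = ptr[off];
            (void)probe;
        }
    }
}
```

The old per-tensor madvise loop becomes a single call over the merged range; the 32-page stride in the forced read is presumably a compromise between prefetch coverage and the cost of touching every page.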