From facb4ea7363d16292920bf33232f6e81b7dc551d Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 22 Jan 2025 11:15:19 +0400
Subject: [PATCH] add option --keep-inp-out-in-metal and fix bugs in unmap

---
 common/arg.cpp    |   8 ++
 common/common.cpp |   1 +
 common/common.h   |   1 +
 include/llama.h   |   1 +
 src/llama.cpp     | 262 +++++++++++++++++++++++-----------------------
 5 files changed, 142 insertions(+), 131 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 24cc7045..602ad9f3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -737,6 +737,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.cuda_mem = value; // in GiB
         }
     ).set_env("LLAMA_ARG_CUDA_MEM"));
+    // "--keep-inp-out-in-metal" is a temporary option to keep the input and output weights in Metal
+    add_opt(llama_arg(
+        {"--keep-inp-out-in-metal"},
+        format("whether to keep the input and output weights in Metal (default: %s)", params.keep_inp_out_in_metal ? "true" : "false"),
+        [](gpt_params & params) {
+            params.keep_inp_out_in_metal = true;
+        }
+    ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
     add_opt(llama_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
diff --git a/common/common.cpp b/common/common.cpp
index efda9d57..e86094b0 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1603,6 +1603,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+    mparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index d8139a3c..12cf587f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -148,6 +148,7 @@ struct gpt_params {
     std::string master_ip    = "localhost"; // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload              = false; // unload layer weights after use or not
+    bool keep_inp_out_in_metal = false; // whether to keep the input/output weights in Metal (disabled by default)
     int32_t cuda_mem         = 999.0; // cuda memory to use, in GiB
     int32_t n_predict        = -1; // new tokens to predict
     int32_t n_ctx            = 0; // context size
diff --git a/include/llama.h b/include/llama.h
index a99c70b2..fe5dc85f 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -312,6 +312,7 @@ extern "C" {
         bool use_mmap;      // use mmap if possible
         bool use_mlock;     // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
+        bool keep_inp_out_in_metal; // whether to keep the input/output weights in Metal
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama.cpp b/src/llama.cpp
index 287479e4..6a94f017 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5324,7 +5324,6 @@ struct llama_model_loader {
     // Returns false if cancelled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
-            struct ggml_context * cpu_ctx,
             llama_buf_map & buffers,
             llama_buf_range & buffer_ranges,
             llama_mlocks * lmlocks,
@@ -5441,111 +5440,104 @@ struct llama_model_loader {
                 ggml_backend_name(upload_backend));
         }

-        std::vector<ggml_context *> merged_ctxs = {ctx};
-        if (cpu_ctx != ctx && cpu_ctx != nullptr) {
-            merged_ctxs.push_back(cpu_ctx);
-        }
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+            const auto * weight = get_weight(ggml_get_name(cur));
+            if (weight == nullptr || !weight->is_needed) {
+                // this can happen with split experts models or this weight is not handled by this device
+                continue;
+            }

-        for (ggml_context * ctx0 : merged_ctxs) {
-            for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx0); cur != NULL; cur = ggml_get_next_tensor(ctx0, cur)) {
-                const auto * weight = get_weight(ggml_get_name(cur));
-                if (weight == nullptr || !weight->is_needed) {
-                    // this can happen with split experts models or this weight is not handled by this device
-                    continue;
+            if (progress_callback) {
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
+            }
+
+            size_t n_size = ggml_nbytes(cur);
+
+            if (use_mmap) {
+                const auto & mapping = mappings.at(weight->idx);
+                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+                if (check_tensors) {
+                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                    }));
                 }

-                if (progress_callback) {
-                    if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
-                        return false;
+                // find the buffer map allocated for the tensor
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                auto bufs = buffers.equal_range(weight->idx);
+                auto ranges = buffer_ranges[ctx][weight->idx];
+
+                for (size_t i = 0; i < ranges.size(); ++i) {
+                    size_t first = ranges[i].first;
+                    size_t last  = ranges[i].second;
+                    if (weight->offs >= first && weight->offs + n_size <= last) {
+                        auto it = bufs.first;
+                        std::advance(it, i);
+                        buf_mmap = it->second;
+                        break;
                     }
                 }

-                size_t n_size = ggml_nbytes(cur);
+                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
+                if (buf_mmap && cur->data == nullptr) {
+                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
+                    if (lmlocks) {
+                        const auto & lmlock = lmlocks->at(weight->idx);
+                        lmlock->grow_to(weight->offs + n_size);
+                    }

-                if (use_mmap) {
-                    const auto & mapping = mappings.at(weight->idx);
-                    uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+                    // NOTE: mmap_used is replaced by buffer_ranges
+                    // auto & mmap_used = mmaps_used[weight->idx];
+                    // mmap_used.first  = std::min(mmap_used.first,  weight->offs);
+                    // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
+                } else {
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
+                }
+            } else {
+                GGML_ASSERT(weight->idx < files.size());
+                const auto & file = files.at(weight->idx);
+                if (ggml_backend_buffer_is_host(cur->buffer)) {
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(cur->data, n_size);
                     if (check_tensors) {
-                        validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
-                            return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
                         }));
                     }
-
-                    // find the buffer map allocated for the tensor
-                    ggml_backend_buffer_t buf_mmap = nullptr;
-                    auto bufs = buffers.equal_range(weight->idx);
-                    auto ranges = buffer_ranges[ctx][weight->idx];
-
-                    for (size_t i = 0; i < ranges.size(); ++i) {
-                        size_t first = ranges[i].first;
-                        size_t last  = ranges[i].second;
-                        if (weight->offs >= first && weight->offs + n_size <= last) {
-                            auto it = bufs.first;
-                            std::advance(it, i);
-                            buf_mmap = it->second;
-                            break;
-                        }
-                    }
-
-                    GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
-                    if (buf_mmap && cur->data == nullptr) {
-                        ggml_backend_tensor_alloc(buf_mmap, cur, data);
-                        if (lmlocks) {
-                            const auto & lmlock = lmlocks->at(weight->idx);
-                            lmlock->grow_to(weight->offs + n_size);
-                        }
-
-                        // NOTE: mmap_used is replaced by buffer_ranges
-                        // auto & mmap_used = mmaps_used[weight->idx];
-                        // mmap_used.first  = std::min(mmap_used.first,  weight->offs);
-                        // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
-                    } else {
-                        ggml_backend_tensor_set(cur, data, 0, n_size);
-                    }
                 } else {
-                    GGML_ASSERT(weight->idx < files.size());
-                    const auto & file = files.at(weight->idx);
-                    if (ggml_backend_buffer_is_host(cur->buffer)) {
+                    // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                    if (upload_backend) {
                         file->seek(weight->offs, SEEK_SET);
-                        file->read_raw(cur->data, n_size);
-                        if (check_tensors) {
-                            validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
-                                return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
-                            }));
+
+                        size_t bytes_read = 0;
+
+                        while (bytes_read < n_size) {
+                            size_t read_iteration = std::min(buffer_size, n_size - bytes_read);
+
+                            ggml_backend_event_synchronize(events[buffer_idx]);
+                            file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                            ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                            ggml_backend_event_record(events[buffer_idx], upload_backend);
+
+                            bytes_read += read_iteration;
+                            ++buffer_idx;
+                            buffer_idx %= n_buffers;
                         }
                     } else {
-                        // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
-                        if (upload_backend) {
-                            file->seek(weight->offs, SEEK_SET);
-
-                            size_t bytes_read = 0;
-
-                            while (bytes_read < n_size) {
-                                size_t read_iteration = std::min(buffer_size, n_size - bytes_read);
-
-                                ggml_backend_event_synchronize(events[buffer_idx]);
-                                file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                                ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
-                                ggml_backend_event_record(events[buffer_idx], upload_backend);
-
-                                bytes_read += read_iteration;
-                                ++buffer_idx;
-                                buffer_idx %= n_buffers;
-                            }
-                        } else {
-                            read_buf.resize(n_size);
-                            file->seek(weight->offs, SEEK_SET);
-                            file->read_raw(read_buf.data(), n_size);
-                            ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                            if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                                throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
-                            }
+                        read_buf.resize(n_size);
+                        file->seek(weight->offs, SEEK_SET);
+                        file->read_raw(read_buf.data(), n_size);
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                        if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                         }
                     }
                 }
-
-                size_done += n_size;
             }
+
+            size_done += n_size;
         }

         // free temporary resources used for async uploads
@@ -5571,46 +5563,10 @@ struct llama_model_loader {
             throw std::runtime_error("found tensors with invalid data");
         }

-        // check if this is the last call and do final cleanup
-        if (size_done >= size_data) {
-            // unmap offloaded tensors and metadata
-            if (use_mmap) {
-                for (uint32_t idx = 0; idx < mappings.size(); idx++) {
-                    auto & mapping = mappings.at(idx);
-                    auto & ranges_used = buffer_ranges[ctx][idx];
-
-                    std::sort(ranges_used.begin(), ranges_used.end(), [](
-                        const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
-                        return a.first < b.first;
-                    });
-
-                    size_t prev_end = 0;
-                    for (const auto & range : ranges_used) {
-                        size_t first = range.first;
-                        size_t last  = range.second;
-                        if (first > prev_end) {
-                            mapping->unmap_fragment(prev_end, first);
-                        }
-                        prev_end = last;
-                    }
-
-                    if (prev_end < mapping->size) {
-                        mapping->unmap_fragment(prev_end, mapping->size);
-                    }
-
-                    // NOTE: mmap_used is replaced by buffer_ranges
-                    // const auto & mmap_used = mmaps_used.at(idx);
-                    // mapping->unmap_fragment(0, mmap_used.first);
-                    // if (mmap_used.second != 0) {
-                    //     mapping->unmap_fragment(mmap_used.second, mapping->size);
-                    // }
-                }
-            }
-            if (progress_callback) {
-                // Even though the model is done loading, we still honor
-                // cancellation since we need to free allocations.
-                return progress_callback(1.0f, progress_callback_user_data);
-            }
+        if (progress_callback) {
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
         }

         return true;
@@ -7430,6 +7386,7 @@ static bool llm_load_tensors_impl(
         enum llama_split_mode split_mode,
         int main_gpu,
         bool use_mlock,
+        bool keep_inp_out_in_metal,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
@@ -9315,7 +9272,7 @@ static bool llm_load_tensors_impl(
                 void * addr = nullptr;
                 auto & ranges = ctx_buffer_ranges[idx];

-                ml.get_mapping_ranges(ranges, &addr, idx, ctx, cpu_ctx);
+                ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_inp_out_in_metal ? cpu_ctx : nullptr);

                 for (const auto & range : ranges) {
                     size_t first = range.first;
@@ -9373,11 +9330,54 @@ static bool llm_load_tensors_impl(
     for (auto & it : ctx_bufs) {
         ggml_context * ctx = it.first;
         auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, cpu_ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+        if (!ml.load_all_data(ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
             return false;
         }
     }

+    // check if this is the last call and do final cleanup
+    if (ml.size_done >= ml.size_data) {
+        // unmap offloaded tensors and metadata
+        if (ml.use_mmap) {
+            for (uint32_t idx = 0; idx < ml.mappings.size(); idx++) {
+                auto & mapping = ml.mappings.at(idx);
+
+                // flatten ranges from all contexts into a vector
+                std::vector<std::pair<size_t, size_t>> ranges_used;
+                for (auto & ctx_buf_ranges : buffer_ranges) {
+                    auto & ranges = ctx_buf_ranges.second[idx];
+                    ranges_used.insert(ranges_used.end(), ranges.begin(), ranges.end());
+                }
+
+                std::sort(ranges_used.begin(), ranges_used.end(), [](
+                    const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
+                    return a.first < b.first;
+                });
+
+                size_t prev_end = 0;
+                for (const auto & range : ranges_used) {
+                    size_t first = range.first;
+                    size_t last  = range.second;
+                    if (first > prev_end) {
+                        mapping->unmap_fragment(prev_end, first);
+                    }
+                    prev_end = std::max(prev_end, last);
+                }
+
+                if (prev_end < mapping->size) {
+                    mapping->unmap_fragment(prev_end, mapping->size);
+                }
+
+                // NOTE: mmap_used is replaced by buffer_ranges
+                // const auto & mmap_used = mmaps_used.at(idx);
+                // mapping->unmap_fragment(0, mmap_used.first);
+                // if (mmap_used.second != 0) {
+                //     mapping->unmap_fragment(mmap_used.second, mapping->size);
+                // }
+            }
+        }
+    }
+
     if (use_mmap_buffer) {
         for (auto & mapping : ml.mappings) {
             model.mappings.emplace_back(std::move(mapping));
@@ -9396,7 +9396,7 @@ int llm_load_tensors(
     try {
         if (!llm_load_tensors_impl(
             *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
-            params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
+            params.main_gpu, params.use_mlock, params.keep_inp_out_in_metal, params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
         }
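
Usage sketch (illustrative, not part of the patch; it assumes this fork keeps the upstream llama.cpp C API entry points llama_model_default_params, llama_load_model_from_file, and llama_free_model): besides the new --keep-inp-out-in-metal CLI flag and the LLAMA_ARG_KEEP_INP_OUT_IN_METAL environment variable, the field this patch adds to llama_model_params can be set directly from code:

    // hypothetical example, for illustration only
    #include "llama.h"

    int main(void) {
        struct llama_model_params mparams = llama_model_default_params();
        mparams.keep_inp_out_in_metal = true; // keep the input/output weights in the Metal backend
        // "model.gguf" is a placeholder path
        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }
        llama_free_model(model);
        return 0;
    }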