From 189ed92cba03b028bf74d1b4246bc6b39de85120 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 21 Jan 2025 21:07:02 +0400
Subject: [PATCH] segment mmap range on Metal shared memory to avoid memory waste

---
 src/llama.cpp | 210 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 152 insertions(+), 58 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 117ba1b2..d2e6e664 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4603,7 +4603,8 @@ namespace GGUFMeta {
     };
 }
 
-using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+using llama_buf_map = std::multimap<uint32_t, ggml_backend_buffer_t>; // <file idx, buffer>
+using llama_buf_range = std::map<ggml_context *, std::map<uint32_t, std::vector<std::pair<size_t, size_t>>>>; // <ctx, <file idx, [<first, last>]>>
 
 static size_t llama_model_max_nodes(const llama_model & model) {
     return std::max<size_t>(8192, model.tensors_by_name.size()*5);
 }
@@ -4640,12 +4641,13 @@ struct llama_model_loader {
 
     // Holds information on a model weight
    struct llama_tensor_weight {
-        uint16_t idx; // source file index
-        size_t offs; // tensor data offset in the original file
+        uint16_t idx;           // source file index
+        size_t offs;            // tensor data offset in the original file
+        mutable bool is_needed; // whether the tensor is needed for this device
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), is_needed(false), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
 
@@ -4653,6 +4655,10 @@ struct llama_model_loader {
                 throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
             }
         }
+
+        void set_as_needed() const {
+            is_needed = true;
+        }
     };
 
     std::vector<llama_tensor_weight> weights;
@@ -5160,6 +5166,9 @@ struct llama_model_loader {
             return NULL;
         }
 
+        auto * weight = get_weight(ggml_get_name(cur));
+        weight->set_as_needed(); // this tensor is needed for this device
+
         return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }
 
@@ -5239,39 +5248,48 @@ struct llama_model_loader {
         }
     }
 
-    void get_mapping_ranges(std::vector<std::pair<size_t, size_t>>& buffer_ranges, void ** addr, int idx, ggml_context * ctx) const {
+    void get_mapping_ranges(std::vector<std::pair<size_t, size_t>>& ranges, void ** addr, int idx, ggml_context * ctx, ggml_context * cpu_ctx) const {
         GGML_ASSERT(!mappings.empty());
         const auto & mapping = mappings.at(idx);
 
         *addr = mapping->addr;
 
-        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const llama_tensor_weight * weight = get_weight(ggml_get_name(tensor));
-                if (!weight || weight->idx != idx) continue;
+        auto merge_tensor_range = [&](ggml_context * context) {
+            for (ggml_tensor * tensor = ggml_get_first_tensor(context); tensor; tensor = ggml_get_next_tensor(context, tensor)) {
+                try {
+                    const llama_tensor_weight* weight = get_weight(ggml_get_name(tensor));
+                    if (!weight || weight->idx != idx) continue;
 
-                size_t tensor_first = weight->offs;
-                size_t tensor_last = tensor_first + ggml_nbytes(tensor);
+                    size_t first = weight->offs;
+                    size_t last = first + ggml_nbytes(tensor);
 
-                auto it = std::lower_bound(
-                    buffer_ranges.begin(), buffer_ranges.end(), std::make_pair(tensor_first, tensor_last),
-                    [](const std::pair<size_t, size_t>& a, const std::pair<size_t, size_t>& b) {
-                        return a.first < b.first;
+                    auto it = std::lower_bound(
+                        ranges.begin(), ranges.end(), std::make_pair(first, last),
+                        [](const std::pair<size_t, size_t>& a, const std::pair<size_t, size_t>& b) {
+                            return a.first < b.first;
+                        }
+                    );
+
+                    if (it != ranges.begin() && (it - 1)->second >= first) {
+                        --it;
+                        it->second = std::max(it->second, last);
+                    } else {
+                        it = ranges.insert(it, {first, last});
                     }
-                );
-                if (it != buffer_ranges.begin() && (it - 1)->second >= tensor_first) {
-                    --it;
-                    it->second = std::max(it->second, tensor_last);
-                } else {
-                    it = buffer_ranges.insert(it, {tensor_first, tensor_last});
+                    while (it + 1 != ranges.end() && (it + 1)->first <= it->second) {
+                        it->second = std::max(it->second, (it + 1)->second);
+                        ranges.erase(it + 1);
+                    }
+                } catch (...) {
+                    // Ignore errors for tensors not in the model
                 }
-
-                while (it + 1 != buffer_ranges.end() && (it + 1)->first <= it->second) {
-                    it->second = std::max(it->second, (it + 1)->second);
-                    buffer_ranges.erase(it + 1);
-                }
-            } catch (...) { }
+            }
+        };
+
+        merge_tensor_range(ctx);
+
+        if (cpu_ctx != ctx && cpu_ctx != nullptr) {
+            merge_tensor_range(cpu_ctx);
         }
     }
 
@@ -5306,7 +5324,8 @@ struct llama_model_loader {
     // Returns false if cancelled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
-            llama_buf_map & bufs,
+            llama_buf_map & buffers,
+            llama_buf_range & buffer_ranges,
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
             void * progress_callback_user_data) {
@@ -5330,7 +5349,14 @@ struct llama_model_loader {
             }
             // When not using mmaped io use async uploads from pinned memory to GPU memory.
             // First determine if the backend supports the necessary features for async uploads.
-            auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
+            auto * buf = [&]() -> ggml_backend_buffer_t { // todo: check
+                auto range = buffers.equal_range(0);
+                if (range.first != range.second) {
+                    return range.first->second;
+                }
+                return nullptr;
+            }();
+
             if (!buf) {
                 LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
                 return nullptr;
             }
@@ -5399,16 +5425,25 @@ struct llama_model_loader {
         if (upload_backend) {
             throw std::runtime_error("async uploads is not supported now\n");
+
+            ggml_backend_buffer_t buf = [&]() -> ggml_backend_buffer_t {
+                auto range = buffers.equal_range(0);
+                if (range.first != range.second) {
+                    return range.first->second;
+                }
+                return nullptr;
+            }();
+
             LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
                 ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
-                ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
+                ggml_backend_buft_name(ggml_backend_buffer_get_type(buf)), // todo: check
                 ggml_backend_name(upload_backend));
         }
 
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
-            if (weight == nullptr) {
-                // this can happen with split experts models
+            if (weight == nullptr || !weight->is_needed) {
+                // this can happen with split experts models, or when the weight is not handled by this device
                 continue;
             }
 
@@ -5422,18 +5457,29 @@ struct llama_model_loader {
 
             if (use_mmap) {
                 const auto & mapping = mappings.at(weight->idx);
-                ggml_backend_buffer_t buf_mmap = nullptr;
-                if (bufs.count(weight->idx)) {
-                    buf_mmap = bufs.at(weight->idx);
-                }
                 uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
-
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                     }));
                 }
 
+                // find the buffer map allocated for the tensor
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                auto bufs = buffers.equal_range(weight->idx);
+                auto ranges = buffer_ranges[ctx][weight->idx];
+
+                for (size_t i = 0; i < ranges.size(); ++i) {
+                    size_t first = ranges[i].first;
+                    size_t last = ranges[i].second;
+                    if (weight->offs >= first && weight->offs + n_size <= last) {
+                        auto it = bufs.first;
+                        std::advance(it, i);
+                        buf_mmap = it->second;
+                        break;
+                    }
+                }
+
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, data);
@@ -5442,9 +5488,10 @@ struct llama_model_loader {
                         lmlock->grow_to(weight->offs + n_size);
                     }
 
-                    auto & mmap_used = mmaps_used[weight->idx];
-                    mmap_used.first = std::min(mmap_used.first, weight->offs);
-                    mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
+                    // NOTE: mmap_used is replaced by buffer_ranges
+                    // auto & mmap_used = mmaps_used[weight->idx];
+                    // mmap_used.first = std::min(mmap_used.first, weight->offs);
+                    // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
@@ -5521,12 +5568,34 @@ struct llama_model_loader {
         // unmap offloaded tensors and metadata
         if (use_mmap) {
             for (uint32_t idx = 0; idx < mappings.size(); idx++) {
-                const auto & mmap_used = mmaps_used.at(idx);
                 auto & mapping = mappings.at(idx);
-                mapping->unmap_fragment(0, mmap_used.first);
-                if (mmap_used.second != 0) {
-                    mapping->unmap_fragment(mmap_used.second, mapping->size);
+                auto & ranges_used = buffer_ranges[ctx][idx];
+
+                std::sort(ranges_used.begin(), ranges_used.end(), [](
+                    const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
+                    return a.first < b.first;
+                });
+
+                size_t prev_end = 0;
+                for (const auto & range : ranges_used) {
+                    size_t first = range.first;
+                    size_t last = range.second;
+                    if (first > prev_end) {
+                        mapping->unmap_fragment(prev_end, first);
+                    }
+                    prev_end = last;
                 }
+
+                if (prev_end < mapping->size) {
+                    mapping->unmap_fragment(prev_end, mapping->size);
+                }
+
+                // NOTE: mmap_used is replaced by buffer_ranges
+                // const auto & mmap_used = mmaps_used.at(idx);
+                // mapping->unmap_fragment(0, mmap_used.first);
+                // if (mmap_used.second != 0) {
+                //     mapping->unmap_fragment(mmap_used.second, mapping->size);
+                // }
             }
         }
         if (progress_callback) {
@@ -9175,6 +9244,14 @@ static bool llm_load_tensors_impl(
         }
     }
 
+    // erase weights that are not needed
+    ml.weights.erase(
+        std::remove_if(ml.weights.begin(), ml.weights.end(), [](const llama_model_loader::llama_tensor_weight & weight) {
+            return !weight.is_needed;
+        }),
+        ml.weights.end()
+    );
+
     ml.init_mappings(false, use_mlock ? &model.mlock_mmaps : nullptr);
 
     model.mappings.reserve(ml.mappings.size());
@@ -9182,16 +9259,23 @@ static bool llm_load_tensors_impl(
     std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
     ctx_bufs.reserve(ctx_map.size());
 
-    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+    // ensure we have enough capacity for the maximum backend buffer we will potentially create
     size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     model.bufs.reserve(n_max_backend_buffer);
 
+    // use the last context (the CPU context) to allocate Metal buffer for input/output tensors
+    ggml_context * cpu_ctx = nullptr;
+    if (my_rank == 0) {
+        cpu_ctx = std::prev(ctx_map.end())->second;
+    }
+
+    llama_buf_range buffer_ranges;
+
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
 
         llama_buf_map bufs;
-        bufs.reserve(n_max_backend_buffer);
 
         // check if this backend device supports buffer_from_host_ptr
         // when using a host buffer as the CPU bakcend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
@@ -9208,24 +9292,34 @@ static bool llm_load_tensors_impl(
             buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         }
 
+        auto & ctx_buffer_ranges = buffer_ranges[ctx];
+
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                const size_t max_size = ggml_get_max_tensor_size(ctx);
+
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
                 // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
                 // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                 void * addr = nullptr;
-                size_t first, last; // NOLINT
-                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
-                if (first >= last) {
-                    continue;
+                auto & ranges = ctx_buffer_ranges[idx];
+
+                ml.get_mapping_ranges(ranges, &addr, idx, ctx, cpu_ctx);
+
+                for (const auto & range : ranges) {
+                    size_t first = range.first;
+                    size_t last = range.second;
+
+                    if (first >= last) continue;
+
+                    ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                    if (buf == nullptr) {
+                        throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                    }
+
+                    model.bufs.push_back(buf);
+                    bufs.emplace(idx, buf);
                 }
-                const size_t max_size = ggml_get_max_tensor_size(ctx);
-                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
-                if (buf == nullptr) {
-                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-                }
-                model.bufs.push_back(buf);
-                bufs.emplace(idx, buf);
             }
         } else {
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
@@ -9268,7 +9362,7 @@ static bool llm_load_tensors_impl(
     for (auto & it : ctx_bufs) {
         ggml_context * ctx = it.first;
         auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+        if (!ml.load_all_data(ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
             return false;
         }
     }
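
Note (not part of the patch): the core of the change is an interval merge. Every needed tensor contributes a [first, last) byte range inside the mmapped file, and get_mapping_ranges() folds those into a sorted list of disjoint segments so that each segment can be wrapped in its own host-pointer (Metal) buffer instead of mapping the whole file. The standalone C++ sketch below mirrors that merge step; merge_range and the sample offsets are illustrative names and values, not code from the repository.

// build a minimal set of disjoint [first, last) segments from per-tensor extents
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

using range_t = std::pair<size_t, size_t>; // [first, last) byte offsets within one mmap

// Insert one tensor extent into a sorted, non-overlapping segment list, merging with
// neighbours the same way the patched llama_model_loader::get_mapping_ranges() does.
static void merge_range(std::vector<range_t> & ranges, size_t first, size_t last) {
    auto it = std::lower_bound(ranges.begin(), ranges.end(), range_t{first, last},
                               [](const range_t & a, const range_t & b) { return a.first < b.first; });

    if (it != ranges.begin() && (it - 1)->second >= first) {
        --it;                                  // overlaps the previous segment: extend it
        it->second = std::max(it->second, last);
    } else {
        it = ranges.insert(it, {first, last}); // otherwise start a new segment
    }

    // swallow any following segments that the grown segment now reaches
    while (it + 1 != ranges.end() && (it + 1)->first <= it->second) {
        it->second = std::max(it->second, (it + 1)->second);
        ranges.erase(it + 1);
    }
}

int main() {
    std::vector<range_t> ranges;
    // three tensors laid out back to back, plus one that lives much later in the file
    merge_range(ranges, 0,       4096);
    merge_range(ranges, 4096,    16384);
    merge_range(ranges, 1 << 20, (1 << 20) + 8192);
    merge_range(ranges, 8192,    12288);

    for (const auto & r : ranges) {
        std::printf("segment [%zu, %zu)\n", r.first, r.second); // two segments, not one huge mapping
    }
    return 0;
}

In the patch this merge runs over the device context and, on rank 0, also over the trailing CPU context, so input/output tensors share the same segmented Metal mapping rather than forcing the whole file to stay mapped.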
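
Note (also not part of the patch): once the mmap is split into segments, load_all_data() has to find which backend buffer a given tensor landed in. Because llama_buf_map is now a multimap and the buffers for one file index are emplaced in the same order as that file's merged segments, the i-th segment pairs with the i-th buffer returned by equal_range. The sketch below assumes that ordering; buffer_t, find_buffer and the sample values are illustrative stand-ins, not the library's API.

// pair each merged segment with the buffer that was created for it
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

using buffer_t  = std::string;                        // stand-in for ggml_backend_buffer_t
using buf_map_t = std::multimap<uint32_t, buffer_t>;  // file idx -> one buffer per segment
using range_t   = std::pair<size_t, size_t>;          // [first, last) offsets in the mmap

// Return the buffer whose segment fully contains [offs, offs + size), or nullptr if none does.
static const buffer_t * find_buffer(const buf_map_t & buffers, const std::vector<range_t> & ranges,
                                    uint32_t file_idx, size_t offs, size_t size) {
    auto bufs = buffers.equal_range(file_idx);
    for (size_t i = 0; i < ranges.size(); ++i) {
        if (offs >= ranges[i].first && offs + size <= ranges[i].second) {
            auto it = bufs.first;
            for (size_t k = 0; k < i && it != bufs.second; ++k) ++it; // walk to the i-th buffer
            return it == bufs.second ? nullptr : &it->second;
        }
    }
    return nullptr;
}

int main() {
    // two merged segments of file 0, each backed by its own (mock) shared-memory buffer
    std::vector<range_t> ranges = {{0, 16384}, {1 << 20, (1 << 20) + 8192}};
    buf_map_t buffers;
    buffers.emplace(0u, "buffer for segment 0");
    buffers.emplace(0u, "buffer for segment 1");

    const buffer_t * buf = find_buffer(buffers, ranges, 0, (1 << 20) + 1024, 2048);
    std::printf("tensor lands in: %s\n", buf ? buf->c_str() : "(none)");
    return 0;
}

The same positional pairing is what the final unmap pass relies on: after loading, every byte of the mapping that falls outside the recorded segments is released with unmap_fragment, which is where the memory saving on Metal shared memory comes from.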