Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-06 20:09:02 +00:00)
segment mmap range on Metal shared memory to avoid memory waste
parent 871a27f66a · commit 189ed92cba
1 changed file with 152 additions and 58 deletions

src/llama.cpp (+152 −58)
@@ -4603,7 +4603,8 @@ namespace GGUFMeta {
     };
 }
 
-using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+using llama_buf_map = std::multimap<uint32_t, ggml_backend_buffer_t>; // <file_idx, buffer_map>
+using llama_buf_range = std::map<ggml_context *, std::map<uint32_t, std::vector<std::pair<size_t, size_t>>>>; // <ggml_context, file_idx, [<first, last>]>
 
 static size_t llama_model_max_nodes(const llama_model & model) {
     return std::max<size_t>(8192, model.tensors_by_name.size()*5);
@@ -4640,12 +4641,13 @@ struct llama_model_loader {
 
     // Holds information on a model weight
     struct llama_tensor_weight {
-        uint16_t idx; // source file index
-        size_t offs; // tensor data offset in the original file
+        uint16_t idx; // source file index
+        size_t offs; // tensor data offset in the original file
+        mutable bool is_needed; // whether the tensor is needed for this device
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), is_needed(false), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
@@ -4653,6 +4655,10 @@ struct llama_model_loader {
                 throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
             }
         }
+
+        void set_as_needed() const {
+            is_needed = true;
+        }
     };
     std::vector<llama_tensor_weight> weights;
 
@@ -5160,6 +5166,9 @@ struct llama_model_loader {
             return NULL;
         }
 
+        auto * weight = get_weight(ggml_get_name(cur));
+        weight->set_as_needed(); // this tensor is needed for this device
+
         return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }
 
@@ -5239,39 +5248,48 @@ struct llama_model_loader {
         }
     }
 
-    void get_mapping_ranges(std::vector<std::pair<size_t, size_t>>& buffer_ranges, void ** addr, int idx, ggml_context * ctx) const {
+    void get_mapping_ranges(std::vector<std::pair<size_t, size_t>>& ranges, void ** addr, int idx, ggml_context * ctx, ggml_context * cpu_ctx) const {
         GGML_ASSERT(!mappings.empty());
         const auto & mapping = mappings.at(idx);
         *addr = mapping->addr;
 
-        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const llama_tensor_weight * weight = get_weight(ggml_get_name(tensor));
-                if (!weight || weight->idx != idx) continue;
-
-                size_t tensor_first = weight->offs;
-                size_t tensor_last = tensor_first + ggml_nbytes(tensor);
-
-                auto it = std::lower_bound(
-                    buffer_ranges.begin(), buffer_ranges.end(), std::make_pair(tensor_first, tensor_last),
-                    [](const std::pair<size_t, size_t>& a, const std::pair<size_t, size_t>& b) {
-                        return a.first < b.first;
-                    }
-                );
-
-                if (it != buffer_ranges.begin() && (it - 1)->second >= tensor_first) {
-                    --it;
-                    it->second = std::max(it->second, tensor_last);
-                } else {
-                    it = buffer_ranges.insert(it, {tensor_first, tensor_last});
-                }
-
-                while (it + 1 != buffer_ranges.end() && (it + 1)->first <= it->second) {
-                    it->second = std::max(it->second, (it + 1)->second);
-                    buffer_ranges.erase(it + 1);
-                }
-            } catch (...) {
-                // Ignore errors for tensors not in the model
-            }
-        }
+        auto merge_tensor_range = [&](ggml_context * context) {
+            for (ggml_tensor * tensor = ggml_get_first_tensor(context); tensor; tensor = ggml_get_next_tensor(context, tensor)) {
+                try {
+                    const llama_tensor_weight* weight = get_weight(ggml_get_name(tensor));
+                    if (!weight || weight->idx != idx) continue;
+
+                    size_t first = weight->offs;
+                    size_t last = first + ggml_nbytes(tensor);
+
+                    auto it = std::lower_bound(
+                        ranges.begin(), ranges.end(), std::make_pair(first, last),
+                        [](const std::pair<size_t, size_t>& a, const std::pair<size_t, size_t>& b) {
+                            return a.first < b.first;
+                        }
+                    );
+
+                    if (it != ranges.begin() && (it - 1)->second >= first) {
+                        --it;
+                        it->second = std::max(it->second, last);
+                    } else {
+                        it = ranges.insert(it, {first, last});
+                    }
+
+                    while (it + 1 != ranges.end() && (it + 1)->first <= it->second) {
+                        it->second = std::max(it->second, (it + 1)->second);
+                        ranges.erase(it + 1);
+                    }
+                } catch (...) {
+                    // Ignore errors for tensors not in the model
+                }
+            }
+        };
+
+        merge_tensor_range(ctx);
+
+        if (cpu_ctx != ctx && cpu_ctx != nullptr) {
+            merge_tensor_range(cpu_ctx);
+        }
     }
 
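The core of the new `get_mapping_ranges` is an insert-and-coalesce step over a sorted vector of byte ranges. Below is a minimal standalone sketch of that interval merge; the helper name and types are illustrative and not part of the patch.

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Sorted, non-overlapping byte ranges [first, last) within one mmapped file.
using byte_ranges = std::vector<std::pair<size_t, size_t>>;

// Insert [first, last) and coalesce it with any ranges it touches, mirroring
// the lower_bound / merge-left / merge-right steps in the patch.
static void insert_range(byte_ranges & ranges, size_t first, size_t last) {
    auto it = std::lower_bound(
        ranges.begin(), ranges.end(), std::make_pair(first, last),
        [](const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
            return a.first < b.first;
        });

    if (it != ranges.begin() && (it - 1)->second >= first) {
        --it;                                    // the previous range already reaches `first`: extend it
        it->second = std::max(it->second, last);
    } else {
        it = ranges.insert(it, {first, last});   // no overlap on the left: insert a new range
    }

    // absorb any following ranges that now overlap the (possibly grown) range
    while (it + 1 != ranges.end() && (it + 1)->first <= it->second) {
        it->second = std::max(it->second, (it + 1)->second);
        ranges.erase(it + 1);
    }
}
```

For example, inserting {0, 8}, {16, 24} and then {4, 18} in any order leaves the single range {0, 24}, which is why `buffer_ranges` ends up holding only the disjoint spans actually occupied by tensors.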
@@ -5306,7 +5324,8 @@ struct llama_model_loader {
     // Returns false if cancelled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
-            llama_buf_map & bufs,
+            llama_buf_map & buffers,
+            llama_buf_range & buffer_ranges,
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
             void * progress_callback_user_data) {
@@ -5330,7 +5349,14 @@ struct llama_model_loader {
             }
             // When not using mmaped io use async uploads from pinned memory to GPU memory.
             // First determine if the backend supports the necessary features for async uploads.
-            auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
+            auto * buf = [&]() -> ggml_backend_buffer_t { // todo: check
+                auto range = buffers.equal_range(0);
+                if (range.first != range.second) {
+                    return range.first->second;
+                }
+                return nullptr;
+            }();
+
             if (!buf) {
                 LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
                 return nullptr;
@@ -5399,16 +5425,25 @@ struct llama_model_loader {
 
         if (upload_backend) {
             throw std::runtime_error("async uploads is not supported now\n");
+
+            ggml_backend_buffer_t buf = [&]() -> ggml_backend_buffer_t {
+                auto range = buffers.equal_range(0);
+                if (range.first != range.second) {
+                    return range.first->second;
+                }
+                return nullptr;
+            }();
+
             LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
                 ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
-                ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
+                ggml_backend_buft_name(ggml_backend_buffer_get_type(buf)), // todo: check
                 ggml_backend_name(upload_backend));
         }
 
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
-            if (weight == nullptr) {
-                // this can happen with split experts models
+            if (weight == nullptr || !weight->is_needed) {
+                // this can happen with split experts models or this weight is not handled by this device
                 continue;
             }
 
@@ -5422,18 +5457,29 @@ struct llama_model_loader {
 
             if (use_mmap) {
                 const auto & mapping = mappings.at(weight->idx);
-                ggml_backend_buffer_t buf_mmap = nullptr;
-                if (bufs.count(weight->idx)) {
-                    buf_mmap = bufs.at(weight->idx);
-                }
                 uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
 
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                     }));
                 }
 
+                // find the buffer map allocated for the tensor
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                auto bufs = buffers.equal_range(weight->idx);
+                auto ranges = buffer_ranges[ctx][weight->idx];
+
+                for (size_t i = 0; i < ranges.size(); ++i) {
+                    size_t first = ranges[i].first;
+                    size_t last = ranges[i].second;
+                    if (weight->offs >= first && weight->offs + n_size <= last) {
+                        auto it = bufs.first;
+                        std::advance(it, i);
+                        buf_mmap = it->second;
+                        break;
+                    }
+                }
+
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, data);
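The lookup above pairs the i-th merged range of a file with the i-th buffer returned by `buffers.equal_range(weight->idx)`; this appears to rely on the buffers being emplaced in range order later in `llm_load_tensors_impl` (multimap preserves insertion order for equal keys). A sketch of the range lookup on its own, with an illustrative helper name that is not in the patch:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Return the index of the merged range that fully contains [offs, offs + size),
// or -1 if no range does. The patch pairs this index with the i-th buffer of
// buffers.equal_range(file_idx), assuming buffers were emplaced in range order.
static int find_range_index(const std::vector<std::pair<size_t, size_t>> & ranges,
                            size_t offs, size_t size) {
    for (size_t i = 0; i < ranges.size(); ++i) {
        if (offs >= ranges[i].first && offs + size <= ranges[i].second) {
            return (int) i;
        }
    }
    return -1;
}
```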
@@ -5442,9 +5488,10 @@ struct llama_model_loader {
                         lmlock->grow_to(weight->offs + n_size);
                     }
 
-                    auto & mmap_used = mmaps_used[weight->idx];
-                    mmap_used.first = std::min(mmap_used.first, weight->offs);
-                    mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
+                    // NOTE: mmap_used is replaced by buffer_ranges
+                    // auto & mmap_used = mmaps_used[weight->idx];
+                    // mmap_used.first = std::min(mmap_used.first, weight->offs);
+                    // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
@@ -5521,12 +5568,34 @@ struct llama_model_loader {
         // unmap offloaded tensors and metadata
         if (use_mmap) {
             for (uint32_t idx = 0; idx < mappings.size(); idx++) {
-                const auto & mmap_used = mmaps_used.at(idx);
                 auto & mapping = mappings.at(idx);
-                mapping->unmap_fragment(0, mmap_used.first);
-                if (mmap_used.second != 0) {
-                    mapping->unmap_fragment(mmap_used.second, mapping->size);
+                auto & ranges_used = buffer_ranges[ctx][idx];
+
+                std::sort(ranges_used.begin(), ranges_used.end(), [](
+                    const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
+                    return a.first < b.first;
+                });
+
+                size_t prev_end = 0;
+                for (const auto & range : ranges_used) {
+                    size_t first = range.first;
+                    size_t last = range.second;
+                    if (first > prev_end) {
+                        mapping->unmap_fragment(prev_end, first);
+                    }
+                    prev_end = last;
                 }
+
+                if (prev_end < mapping->size) {
+                    mapping->unmap_fragment(prev_end, mapping->size);
+                }
+
+                // NOTE: mmap_used is replaced by buffer_ranges
+                // const auto & mmap_used = mmaps_used.at(idx);
+                // mapping->unmap_fragment(0, mmap_used.first);
+                // if (mmap_used.second != 0) {
+                //     mapping->unmap_fragment(mmap_used.second, mapping->size);
+                // }
             }
         }
         if (progress_callback) {
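The unmap loop releases every part of the file mapping that no backend buffer covers: the gap before each used range and the tail after the last one. A small sketch of that complement computation under the same sorted-range assumption (the helper name is illustrative, not from the patch):

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Given the byte ranges actually backed by backend buffers, compute the
// complementary fragments of an mmapped file of `file_size` bytes that can be
// released. Each returned fragment corresponds to one unmap_fragment call.
static std::vector<std::pair<size_t, size_t>>
unused_fragments(std::vector<std::pair<size_t, size_t>> used, size_t file_size) {
    std::sort(used.begin(), used.end(),
              [](const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
                  return a.first < b.first;
              });

    std::vector<std::pair<size_t, size_t>> gaps;
    size_t prev_end = 0;
    for (const auto & range : used) {
        if (range.first > prev_end) {
            gaps.push_back({prev_end, range.first}); // hole before this used range
        }
        prev_end = range.second;
    }
    if (prev_end < file_size) {
        gaps.push_back({prev_end, file_size});       // tail after the last used range
    }
    return gaps;
}
```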
@@ -9175,6 +9244,14 @@ static bool llm_load_tensors_impl(
         }
     }
 
+    // erase weights that are not needed
+    ml.weights.erase(
+        std::remove_if(ml.weights.begin(), ml.weights.end(), [](const llama_model_loader::llama_tensor_weight & weight) {
+            return !weight.is_needed;
+        }),
+        ml.weights.end()
+    );
+
     ml.init_mappings(false, use_mlock ? &model.mlock_mmaps : nullptr);
     model.mappings.reserve(ml.mappings.size());
 
@@ -9182,16 +9259,23 @@ static bool llm_load_tensors_impl(
     std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
     ctx_bufs.reserve(ctx_map.size());
 
-    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+    // ensure we have enough capacity for the maximum backend buffer we will potentially create
     size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     model.bufs.reserve(n_max_backend_buffer);
 
+    // use the last context (the CPU context) to allocate Metal buffer for input/output tensors
+    ggml_context * cpu_ctx = nullptr;
+    if (my_rank == 0) {
+        cpu_ctx = std::prev(ctx_map.end())->second;
+    }
+
+    llama_buf_range buffer_ranges;
+
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
 
         llama_buf_map bufs;
-        bufs.reserve(n_max_backend_buffer);
 
         // check if this backend device supports buffer_from_host_ptr
         // when using a host buffer as the CPU bakcend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
@@ -9208,24 +9292,34 @@ static bool llm_load_tensors_impl(
             buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         }
 
+        auto & ctx_buffer_ranges = buffer_ranges[ctx];
+
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                const size_t max_size = ggml_get_max_tensor_size(ctx);
+
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
                 // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
                 // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                 void * addr = nullptr;
-                size_t first, last; // NOLINT
-                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
-                if (first >= last) {
-                    continue;
-                }
-                const size_t max_size = ggml_get_max_tensor_size(ctx);
-                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
-                if (buf == nullptr) {
-                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-                }
-                model.bufs.push_back(buf);
-                bufs.emplace(idx, buf);
+                auto & ranges = ctx_buffer_ranges[idx];
+
+                ml.get_mapping_ranges(ranges, &addr, idx, ctx, cpu_ctx);
+
+                for (const auto & range : ranges) {
+                    size_t first = range.first;
+                    size_t last = range.second;
+
+                    if (first >= last) continue;
+
+                    ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                    if (buf == nullptr) {
+                        throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                    }
+
+                    model.bufs.push_back(buf);
+                    bufs.emplace(idx, buf);
+                }
             }
         } else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
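These per-range buffers are what realize the saving named in the commit title: instead of wrapping the whole span between a context's first and last tensor in a single Metal host-pointer buffer, the loop creates one buffer per merged range, so the gaps between ranges are never wired into shared memory and are later unmapped. A rough accounting sketch, assuming sorted, disjoint ranges (names are illustrative, not from the patch):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Compare how many bytes end up covered by backend buffers: the previous
// single-span approach covers [min_first, max_last), while the per-range
// approach only covers the merged ranges themselves.
static std::pair<size_t, size_t>
mapped_bytes(const std::vector<std::pair<size_t, size_t>> & ranges) {
    size_t span_bytes = 0, range_bytes = 0;
    if (!ranges.empty()) {
        span_bytes = ranges.back().second - ranges.front().first; // old: one buffer over the whole span
        for (const auto & r : ranges) {
            range_bytes += r.second - r.first;                    // new: one buffer per merged range
        }
    }
    return {span_bytes, range_bytes};
}
```

For example, if a context's tensors occupy roughly [0, 1 GiB) and [3 GiB, 4 GiB) of a file, the old single span would cover about 4 GiB of Metal shared memory, while two per-range buffers cover about 2 GiB.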
@@ -9268,7 +9362,7 @@ static bool llm_load_tensors_impl(
     for (auto & it : ctx_bufs) {
         ggml_context * ctx = it.first;
         auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+        if (!ml.load_all_data(ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
             return false;
         }
     }