From ce2ef9699f88344f669d52232c79e83d4c2e9b6c Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 21 Jan 2025 22:43:51 +0400
Subject: [PATCH] fix mapping unmap_fragment error

---
 src/llama.cpp | 175 +++++++++++++++++++++++++++-----------------------
 1 file changed, 93 insertions(+), 82 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index d2e6e664..287479e4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5324,6 +5324,7 @@ struct llama_model_loader {
     // Returns false if cancelled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
+            struct ggml_context * cpu_ctx,
             llama_buf_map & buffers,
             llama_buf_range & buffer_ranges,
             llama_mlocks * lmlocks,
@@ -5440,104 +5441,111 @@ struct llama_model_loader {
                 ggml_backend_name(upload_backend));
         }
 
-        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
-            const auto * weight = get_weight(ggml_get_name(cur));
-            if (weight == nullptr || !weight->is_needed) {
-                // this can happen with split experts models or this weight is not handled by this device
-                continue;
-            }
+        std::vector<ggml_context *> merged_ctxs = {ctx};
+        if (cpu_ctx != ctx && cpu_ctx != nullptr) {
+            merged_ctxs.push_back(cpu_ctx);
+        }
 
-            if (progress_callback) {
-                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
-                    return false;
-                }
-            }
-
-            size_t n_size = ggml_nbytes(cur);
-
-            if (use_mmap) {
-                const auto & mapping = mappings.at(weight->idx);
-                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
-                if (check_tensors) {
-                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
-                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
-                    }));
+        for (ggml_context * ctx0 : merged_ctxs) {
+            for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx0); cur != NULL; cur = ggml_get_next_tensor(ctx0, cur)) {
+                const auto * weight = get_weight(ggml_get_name(cur));
+                if (weight == nullptr || !weight->is_needed) {
+                    // this can happen with split experts models or this weight is not handled by this device
+                    continue;
                 }
 
-                // find the buffer map allocated for the tensor
-                ggml_backend_buffer_t buf_mmap = nullptr;
-                auto bufs = buffers.equal_range(weight->idx);
-                auto ranges = buffer_ranges[ctx][weight->idx];
-
-                for (size_t i = 0; i < ranges.size(); ++i) {
-                    size_t first = ranges[i].first;
-                    size_t last = ranges[i].second;
-                    if (weight->offs >= first && weight->offs + n_size <= last) {
-                        auto it = bufs.first;
-                        std::advance(it, i);
-                        buf_mmap = it->second;
-                        break;
+                if (progress_callback) {
+                    if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                        return false;
                     }
                 }
 
-                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
-                if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
-                    if (lmlocks) {
-                        const auto & lmlock = lmlocks->at(weight->idx);
-                        lmlock->grow_to(weight->offs + n_size);
-                    }
+                size_t n_size = ggml_nbytes(cur);
 
-                    // NOTE: mmap_used is replaced by buffer_ranges
-                    // auto & mmap_used = mmaps_used[weight->idx];
-                    // mmap_used.first = std::min(mmap_used.first, weight->offs);
-                    // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
-                } else {
-                    ggml_backend_tensor_set(cur, data, 0, n_size);
-                }
-            } else {
-                GGML_ASSERT(weight->idx < files.size());
-                const auto & file = files.at(weight->idx);
-                if (ggml_backend_buffer_is_host(cur->buffer)) {
-                    file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(cur->data, n_size);
+                if (use_mmap) {
+                    const auto & mapping = mappings.at(weight->idx);
+                    uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
                     if (check_tensors) {
-                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
-                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                        validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                            return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                         }));
                     }
+
+                    // find the buffer map allocated for the tensor
+                    ggml_backend_buffer_t buf_mmap = nullptr;
+                    auto bufs = buffers.equal_range(weight->idx);
+                    auto ranges = buffer_ranges[ctx][weight->idx];
+
+                    for (size_t i = 0; i < ranges.size(); ++i) {
+                        size_t first = ranges[i].first;
+                        size_t last = ranges[i].second;
+                        if (weight->offs >= first && weight->offs + n_size <= last) {
+                            auto it = bufs.first;
+                            std::advance(it, i);
+                            buf_mmap = it->second;
+                            break;
+                        }
+                    }
+
+                    GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
+                    if (buf_mmap && cur->data == nullptr) {
+                        ggml_backend_tensor_alloc(buf_mmap, cur, data);
+                        if (lmlocks) {
+                            const auto & lmlock = lmlocks->at(weight->idx);
+                            lmlock->grow_to(weight->offs + n_size);
+                        }
+
+                        // NOTE: mmap_used is replaced by buffer_ranges
+                        // auto & mmap_used = mmaps_used[weight->idx];
+                        // mmap_used.first = std::min(mmap_used.first, weight->offs);
+                        // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
+                    } else {
+                        ggml_backend_tensor_set(cur, data, 0, n_size);
+                    }
                 } else {
-                    // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
-                    if (upload_backend) {
+                    GGML_ASSERT(weight->idx < files.size());
+                    const auto & file = files.at(weight->idx);
+                    if (ggml_backend_buffer_is_host(cur->buffer)) {
                         file->seek(weight->offs, SEEK_SET);
-
-                        size_t bytes_read = 0;
-
-                        while (bytes_read < n_size) {
-                            size_t read_iteration = std::min(buffer_size, n_size - bytes_read);
-
-                            ggml_backend_event_synchronize(events[buffer_idx]);
-                            file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                            ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
-                            ggml_backend_event_record(events[buffer_idx], upload_backend);
-
-                            bytes_read += read_iteration;
-                            ++buffer_idx;
-                            buffer_idx %= n_buffers;
+                        file->read_raw(cur->data, n_size);
+                        if (check_tensors) {
+                            validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                                return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                            }));
                         }
                     } else {
-                        read_buf.resize(n_size);
-                        file->seek(weight->offs, SEEK_SET);
-                        file->read_raw(read_buf.data(), n_size);
-                        ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                        if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                        // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                        if (upload_backend) {
+                            file->seek(weight->offs, SEEK_SET);
+
+                            size_t bytes_read = 0;
+
+                            while (bytes_read < n_size) {
+                                size_t read_iteration = std::min(buffer_size, n_size - bytes_read);
+
+                                ggml_backend_event_synchronize(events[buffer_idx]);
+                                file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                                ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                                ggml_backend_event_record(events[buffer_idx], upload_backend);
+
+                                bytes_read += read_iteration;
+                                ++buffer_idx;
+                                buffer_idx %= n_buffers;
+                            }
+                        } else {
+                            read_buf.resize(n_size);
+                            file->seek(weight->offs, SEEK_SET);
+                            file->read_raw(read_buf.data(), n_size);
+                            ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                            if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                                throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                            }
                         }
                     }
                 }
-            }
 
-            size_done += n_size;
+                size_done += n_size;
+            }
         }
 
         // free temporary resources used for async uploads
@@ -9266,7 +9274,10 @@ static bool llm_load_tensors_impl(
     // use the last context (the CPU context) to allocate Metal buffer for input/output tensors
     ggml_context * cpu_ctx = nullptr;
     if (my_rank == 0) {
-        cpu_ctx = std::prev(ctx_map.end())->second;
+        auto last_it = std::prev(ctx_map.end());
+        if (last_it->first == ggml_backend_cpu_buffer_type()) {
+            cpu_ctx = last_it->second;
+        }
     }
 
     llama_buf_range buffer_ranges;
@@ -9362,7 +9373,7 @@ static bool llm_load_tensors_impl(
     for (auto & it : ctx_bufs) {
         ggml_context * ctx = it.first;
         auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+        if (!ml.load_all_data(ctx, cpu_ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
             return false;
         }
     }
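
Note: below is a minimal standalone sketch of the merged-context walk this patch introduces, for reference. Only ggml_get_first_tensor() and ggml_get_next_tensor() are real ggml API here; process_tensor() is a hypothetical stand-in for the per-weight mmap/file-read logic inside load_all_data().

    // Sketch only; mirrors the merged_ctxs logic from the hunk at 5441 above.
    #include "ggml.h"
    #include <vector>

    // hypothetical placeholder for the per-weight work done in load_all_data()
    static void process_tensor(struct ggml_tensor * cur) {
        (void) cur;
    }

    static void walk_merged_contexts(struct ggml_context * ctx, struct ggml_context * cpu_ctx) {
        std::vector<struct ggml_context *> merged_ctxs = {ctx};
        // add the CPU context at most once, and only when it is a distinct,
        // non-null context; llm_load_tensors_impl now leaves cpu_ctx as nullptr
        // when the last context in ctx_map is not backed by the CPU buffer type
        if (cpu_ctx != ctx && cpu_ctx != nullptr) {
            merged_ctxs.push_back(cpu_ctx);
        }
        for (struct ggml_context * ctx0 : merged_ctxs) {
            for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx0); cur != NULL; cur = ggml_get_next_tensor(ctx0, cur)) {
                process_tensor(cur);
            }
        }
    }

Judging from the subject line and the shape of the change, the point of the extra walk is that tensors living in the separate CPU context were previously never visited by load_all_data(), so the fragments of the mapping that back them could be unmapped while still needed; visiting both contexts in one pass, with buffer_ranges deciding which mapped fragment holds each weight, avoids the unmap_fragment error.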