add option --keep-inp-out-in-metal and fix bugs in unmapping unused mmap ranges

2025-09-08 03:19:05 +00:00 · 2025-01-22 11:15:19 +04:00 · 2025-01-22 11:15:19 +04:00 · facb4ea736
commit facb4ea736
parent ce2ef9699f
5 changed files with 142 additions and 131 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -737,6 +737,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.cuda_mem = value; // in GiB
        }
    ).set_env("LLAMA_ARG_CUDA_MEM"));
+    // "--keep-inp-out-in-metal" is a temporary option that keeps the input and output weights in Metal
+    add_opt(llama_arg(
+        {"--keep-inp-out-in-metal"},
+        format("whether to keep input and output weight in metal (default: %s)", params.keep_inp_out_in_metal ? "true" : "false"),
+        [](gpt_params & params) {
+            params.keep_inp_out_in_metal = true;
+        }
+    ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
    add_opt(llama_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1603,6 +1603,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
+    mparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal;
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
--- a/common/common.h
+++ b/common/common.h
@ -148,6 +148,7 @@ struct gpt_params {
    std::string master_ip         = "localhost"; // ip address of the master node
    std::string next_node_ip      = "localhost"; // ip address of my next node
    bool    unload                = false; // unload layer weights after use or not
+    bool    keep_inp_out_in_metal = false; // whether to keep input/output weight in metal, not by default
    int32_t cuda_mem              = 999.0; // cuda memory to use, in GiB
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =     0; // context size
--- a/include/llama.h
+++ b/include/llama.h
@ -312,6 +312,7 @@ extern "C" {
        bool use_mmap;      // use mmap if possible
        bool use_mlock;     // force system to keep model in RAM
        bool check_tensors; // validate model tensor data
+        bool keep_inp_out_in_metal; // whether to keep input/output weight in metal
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -5324,7 +5324,6 @@ struct llama_model_loader {
    // Returns false if cancelled by progress_callback
    bool load_all_data(
            struct ggml_context   * ctx,
-            struct ggml_context   * cpu_ctx,
            llama_buf_map         & buffers,
            llama_buf_range       & buffer_ranges,
            llama_mlocks          * lmlocks,
@ -5441,111 +5440,104 @@ struct llama_model_loader {
                ggml_backend_name(upload_backend));
        }

-        std::vector<ggml_context *> merged_ctxs = {ctx};
-        if (cpu_ctx != ctx && cpu_ctx != nullptr) {
-            merged_ctxs.push_back(cpu_ctx);
-        }
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+            const auto * weight = get_weight(ggml_get_name(cur));
+            if (weight == nullptr || !weight->is_needed) {
+                // this can happen with split-expert models, or when this weight is not handled by this device
+                continue;
+            }

-        for (ggml_context * ctx0 : merged_ctxs) {
-            for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx0); cur != NULL; cur = ggml_get_next_tensor(ctx0, cur)) {
-                const auto * weight = get_weight(ggml_get_name(cur));
-                if (weight == nullptr || !weight->is_needed) {
-                    // this can happen with split experts models or this weight is not handled by this device
-                    continue;
+            if (progress_callback) {
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
+            }
+
+            size_t n_size = ggml_nbytes(cur);
+
+            if (use_mmap) {
+                const auto & mapping = mappings.at(weight->idx);
+                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+                if (check_tensors) {
+                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                    }));
                }

-                if (progress_callback) {
-                    if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
-                        return false;
+                // find the buffer map allocated for the tensor
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                auto bufs = buffers.equal_range(weight->idx);
+                auto ranges = buffer_ranges[ctx][weight->idx];
+
+                for (size_t i = 0; i < ranges.size(); ++i) {
+                    size_t first = ranges[i].first;
+                    size_t last  = ranges[i].second;
+                    if (weight->offs >= first && weight->offs + n_size <= last) {
+                        auto it = bufs.first;
+                        std::advance(it, i);
+                        buf_mmap = it->second;
+                        break;
                    }
                }

-                size_t n_size = ggml_nbytes(cur);
+                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
+                if (buf_mmap && cur->data == nullptr) {
+                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
+                    if (lmlocks) {
+                        const auto & lmlock = lmlocks->at(weight->idx);
+                        lmlock->grow_to(weight->offs + n_size);
+                    }

-                if (use_mmap) {
-                    const auto & mapping = mappings.at(weight->idx);
-                    uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+                    // NOTE: mmap_used is replaced by buffer_ranges
+                    // auto & mmap_used = mmaps_used[weight->idx];
+                    // mmap_used.first  = std::min(mmap_used.first,  weight->offs);
+                    // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
+                } else {
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
+                }
+            } else {
+                GGML_ASSERT(weight->idx < files.size());
+                const auto & file = files.at(weight->idx);
+                if (ggml_backend_buffer_is_host(cur->buffer)) {
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(cur->data, n_size);
                    if (check_tensors) {
-                        validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
-                            return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
                        }));
                    }
-
-                    // find the buffer map allocated for the tensor
-                    ggml_backend_buffer_t buf_mmap = nullptr;
-                    auto bufs = buffers.equal_range(weight->idx);
-                    auto ranges = buffer_ranges[ctx][weight->idx];
-
-                    for (size_t i = 0; i < ranges.size(); ++i) {
-                        size_t first = ranges[i].first;
-                        size_t last  = ranges[i].second;
-                        if (weight->offs >= first && weight->offs + n_size <= last) {
-                            auto it = bufs.first;
-                            std::advance(it, i);
-                            buf_mmap = it->second;
-                            break;
-                        }
-                    }
-
-                    GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
-                    if (buf_mmap && cur->data == nullptr) {
-                        ggml_backend_tensor_alloc(buf_mmap, cur, data);
-                        if (lmlocks) {
-                            const auto & lmlock = lmlocks->at(weight->idx);
-                            lmlock->grow_to(weight->offs + n_size);
-                        }
-
-                        // NOTE: mmap_used is replaced by buffer_ranges
-                        // auto & mmap_used = mmaps_used[weight->idx];
-                        // mmap_used.first  = std::min(mmap_used.first,  weight->offs);
-                        // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
-                    } else {
-                        ggml_backend_tensor_set(cur, data, 0, n_size);
-                    }
                } else {
-                    GGML_ASSERT(weight->idx < files.size());
-                    const auto & file = files.at(weight->idx);
-                    if (ggml_backend_buffer_is_host(cur->buffer)) {
+                    // If upload_backend is valid, load the tensor in chunks into pinned memory and upload the buffers asynchronously to the GPU.
+                    if (upload_backend) {
                        file->seek(weight->offs, SEEK_SET);
-                        file->read_raw(cur->data, n_size);
-                        if (check_tensors) {
-                            validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
-                                return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
-                            }));
+
+                        size_t bytes_read = 0;
+
+                        while (bytes_read < n_size) {
+                            size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                            ggml_backend_event_synchronize(events[buffer_idx]);
+                            file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                            ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                            ggml_backend_event_record(events[buffer_idx], upload_backend);
+
+                            bytes_read += read_iteration;
+                            ++buffer_idx;
+                            buffer_idx %= n_buffers;
                        }
                    } else {
-                        // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
-                        if (upload_backend) {
-                            file->seek(weight->offs, SEEK_SET);
-
-                            size_t bytes_read = 0;
-
-                            while (bytes_read < n_size) {
-                                size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
-
-                                ggml_backend_event_synchronize(events[buffer_idx]);
-                                file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                                ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
-                                ggml_backend_event_record(events[buffer_idx], upload_backend);
-
-                                bytes_read += read_iteration;
-                                ++buffer_idx;
-                                buffer_idx %= n_buffers;
-                            }
-                        } else {
-                            read_buf.resize(n_size);
-                            file->seek(weight->offs, SEEK_SET);
-                            file->read_raw(read_buf.data(), n_size);
-                            ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                            if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                                throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
-                            }
+                        read_buf.resize(n_size);
+                        file->seek(weight->offs, SEEK_SET);
+                        file->read_raw(read_buf.data(), n_size);
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                        if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                        }
                    }
                }
-
-                size_done += n_size;
            }
+
+            size_done += n_size;
        }

        // free temporary resources used for async uploads
@ -5571,46 +5563,10 @@ struct llama_model_loader {
            throw std::runtime_error("found tensors with invalid data");
        }

-        // check if this is the last call and do final cleanup
-        if (size_done >= size_data) {
-            // unmap offloaded tensors and metadata
-            if (use_mmap) {
-                for (uint32_t idx = 0; idx < mappings.size(); idx++) {
-                    auto & mapping = mappings.at(idx);
-                    auto & ranges_used = buffer_ranges[ctx][idx];
-
-                    std::sort(ranges_used.begin(), ranges_used.end(), [](
-                        const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
-                            return a.first < b.first;
-                    });
-
-                    size_t prev_end = 0;
-                    for (const auto & range : ranges_used) {
-                        size_t first = range.first;
-                        size_t last = range.second;
-                        if (first > prev_end) {
-                            mapping->unmap_fragment(prev_end, first);
-                        }
-                        prev_end = last;
-                    }
-
-                    if (prev_end < mapping->size) {
-                        mapping->unmap_fragment(prev_end, mapping->size);
-                    }
-                    
-                    // NOTE: mmap_used is replaced by buffer_ranges
-                    // const auto & mmap_used = mmaps_used.at(idx);
-                    // mapping->unmap_fragment(0, mmap_used.first);
-                    // if (mmap_used.second != 0) {
-                    //     mapping->unmap_fragment(mmap_used.second, mapping->size);
-                    // }
-                }
-            }
-            if (progress_callback) {
-                // Even though the model is done loading, we still honor
-                // cancellation since we need to free allocations.
-                return progress_callback(1.0f, progress_callback_user_data);
-            }
+        if (progress_callback) {
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
        }

        return true;
@ -7430,6 +7386,7 @@ static bool llm_load_tensors_impl(
        enum llama_split_mode   split_mode,
        int                     main_gpu,
        bool                    use_mlock,
+        bool                    keep_inp_out_in_metal,
        llama_progress_callback progress_callback,
        void                  * progress_callback_user_data) {
    auto & hparams = model.hparams;
@ -9315,7 +9272,7 @@ static bool llm_load_tensors_impl(
                void * addr = nullptr;
                auto & ranges = ctx_buffer_ranges[idx]; 

-                ml.get_mapping_ranges(ranges, &addr, idx, ctx, cpu_ctx);
+                ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_inp_out_in_metal ? cpu_ctx : nullptr);

                for (const auto & range : ranges) {
                    size_t first = range.first;
@ -9373,11 +9330,54 @@ static bool llm_load_tensors_impl(
    for (auto & it : ctx_bufs) {
        ggml_context * ctx  = it.first;
        auto         & bufs = it.second;
-        if (!ml.load_all_data(ctx, cpu_ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+        if(!ml.load_all_data(ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
            return false;
        }
    }

+    // once all tensor data has been loaded (size_done has reached size_data), do the final cleanup
+    if (ml.size_done >= ml.size_data) {
+        // unmap offloaded tensors and metadata
+        if (ml.use_mmap) {
+            for (uint32_t idx = 0; idx < ml.mappings.size(); idx++) {
+                auto & mapping = ml.mappings.at(idx);
+
+                // flatten ranges from all contexts into a vector
+                std::vector<std::pair<size_t, size_t>> ranges_used;
+                for (auto & ctx_buf_ranges : buffer_ranges) {
+                    auto & ranges = ctx_buf_ranges.second[idx];
+                    ranges_used.insert(ranges_used.end(), ranges.begin(), ranges.end());
+                }
+
+                std::sort(ranges_used.begin(), ranges_used.end(), [](
+                    const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
+                        return a.first < b.first;
+                });
+
+                size_t prev_end = 0;
+                for (const auto & range : ranges_used) {
+                    size_t first = range.first;
+                    size_t last = range.second;
+                    if (first > prev_end) {
+                        mapping->unmap_fragment(prev_end, first);
+                    }
+                    prev_end = std::max(prev_end, last);
+                }
+
+                if (prev_end < mapping->size) {
+                    mapping->unmap_fragment(prev_end, mapping->size);
+                }
+                
+                // NOTE: mmap_used is replaced by buffer_ranges
+                // const auto & mmap_used = mmaps_used.at(idx);
+                // mapping->unmap_fragment(0, mmap_used.first);
+                // if (mmap_used.second != 0) {
+                //     mapping->unmap_fragment(mmap_used.second, mapping->size);
+                // }
+            }
+        }
+    }
+
    if (use_mmap_buffer) {
        for (auto & mapping : ml.mappings) {
            model.mappings.emplace_back(std::move(mapping));
@ -9396,7 +9396,7 @@ int llm_load_tensors(
    try {
        if (!llm_load_tensors_impl(
            *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode, 
-            params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
+            params.main_gpu, params.use_mlock, params.keep_inp_out_in_metal, params.progress_callback, params.progress_callback_user_data
        )) {
            return -2;
        }