Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-08 03:19:05 +00:00)
add option --keep-inp-out-in-metal and fix bugs in unmap
This commit is contained in:
parent ce2ef9699f
commit facb4ea736
5 changed files with 142 additions and 131 deletions
@@ -737,6 +737,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.cuda_mem = value; // in GiB
        }
    ).set_env("LLAMA_ARG_CUDA_MEM"));
+   // "--keep-inp-out-in-metal" is a temporary option to keep the input and output in metal
+   add_opt(llama_arg(
+       {"--keep-inp-out-in-metal"},
+       format("whether to keep input and output weight in metal (default: %s)", params.keep_inp_out_in_metal ? "true" : "false"),
+       [](gpt_params & params) {
+           params.keep_inp_out_in_metal = true;
+       }
+   ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
    add_opt(llama_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
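The new flag takes no value and can also be enabled through the LLAMA_ARG_KEEP_INP_OUT_IN_METAL environment variable. As a stand-alone illustration of that pattern (this sketch is not the project's parser, and the precedence between flag and environment variable here is an assumption), the option boils down to flipping a single boolean:

    // illustrative sketch only; params_t stands in for gpt_params
    #include <cstdlib>
    #include <cstring>
    #include <iostream>

    struct params_t {
        bool keep_inp_out_in_metal = false; // same default as in the diff above
    };

    int main(int argc, char ** argv) {
        params_t params;

        // environment variable first, then the flag; the real parser's precedence may differ
        if (const char * env = std::getenv("LLAMA_ARG_KEEP_INP_OUT_IN_METAL")) {
            params.keep_inp_out_in_metal = std::strcmp(env, "0") != 0;
        }
        for (int i = 1; i < argc; ++i) {
            if (std::strcmp(argv[i], "--keep-inp-out-in-metal") == 0) {
                params.keep_inp_out_in_metal = true; // the flag takes no value, like the lambda above
            }
        }

        std::cout << "keep_inp_out_in_metal = " << std::boolalpha
                  << params.keep_inp_out_in_metal << std::endl;
        return 0;
    }

The boolean then flows into llama_model_params via llama_model_params_from_gpt_params and down to llm_load_tensors_impl, as the following hunks show.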
@@ -1603,6 +1603,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.use_mmap = params.use_mmap;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
+   mparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal;
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
@@ -148,6 +148,7 @@ struct gpt_params {
    std::string master_ip = "localhost"; // ip address of the master node
    std::string next_node_ip = "localhost"; // ip address of my next node
    bool unload = false; // unload layer weights after use or not
+   bool keep_inp_out_in_metal = false; // whether to keep input/output weight in metal, not by default
    int32_t cuda_mem = 999.0; // cuda memory to use, in GiB
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_ctx = 0; // context size
@@ -312,6 +312,7 @@ extern "C" {
        bool use_mmap; // use mmap if possible
        bool use_mlock; // force system to keep model in RAM
        bool check_tensors; // validate model tensor data
+       bool keep_inp_out_in_metal; // whether to keep input/output weight in metal
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
src/llama.cpp (262 lines changed)
@@ -5324,7 +5324,6 @@ struct llama_model_loader {
    // Returns false if cancelled by progress_callback
    bool load_all_data(
            struct ggml_context * ctx,
-           struct ggml_context * cpu_ctx,
            llama_buf_map & buffers,
            llama_buf_range & buffer_ranges,
            llama_mlocks * lmlocks,
@@ -5441,111 +5440,104 @@ struct llama_model_loader {
                    ggml_backend_name(upload_backend));
        }

-       std::vector<ggml_context *> merged_ctxs = {ctx};
-       if (cpu_ctx != ctx && cpu_ctx != nullptr) {
-           merged_ctxs.push_back(cpu_ctx);
-       }
-       for (ggml_context * ctx0 : merged_ctxs) {
-           for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx0); cur != NULL; cur = ggml_get_next_tensor(ctx0, cur)) {
+       for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
            const auto * weight = get_weight(ggml_get_name(cur));
            if (weight == nullptr || !weight->is_needed) {
                // this can happen with split experts models or this weight is not handled by this device
                continue;
            }

            if (progress_callback) {
                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                    return false;
                }
            }

            size_t n_size = ggml_nbytes(cur);

            if (use_mmap) {
                const auto & mapping = mappings.at(weight->idx);
                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
                if (check_tensors) {
                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                    }));
                }

                // find the buffer map allocated for the tensor
                ggml_backend_buffer_t buf_mmap = nullptr;
                auto bufs = buffers.equal_range(weight->idx);
                auto ranges = buffer_ranges[ctx][weight->idx];

                for (size_t i = 0; i < ranges.size(); ++i) {
                    size_t first = ranges[i].first;
                    size_t last = ranges[i].second;
                    if (weight->offs >= first && weight->offs + n_size <= last) {
                        auto it = bufs.first;
                        std::advance(it, i);
                        buf_mmap = it->second;
                        break;
                    }
                }

                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                if (buf_mmap && cur->data == nullptr) {
                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
                    if (lmlocks) {
                        const auto & lmlock = lmlocks->at(weight->idx);
                        lmlock->grow_to(weight->offs + n_size);
                    }

                    // NOTE: mmap_used is replaced by buffer_ranges
                    // auto & mmap_used = mmaps_used[weight->idx];
                    // mmap_used.first = std::min(mmap_used.first, weight->offs);
                    // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                } else {
                    ggml_backend_tensor_set(cur, data, 0, n_size);
                }
            } else {
                GGML_ASSERT(weight->idx < files.size());
                const auto & file = files.at(weight->idx);
                if (ggml_backend_buffer_is_host(cur->buffer)) {
                    file->seek(weight->offs, SEEK_SET);
                    file->read_raw(cur->data, n_size);
                    if (check_tensors) {
                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
                        }));
                    }
                } else {
                    // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                    if (upload_backend) {
                        file->seek(weight->offs, SEEK_SET);

                        size_t bytes_read = 0;

                        while (bytes_read < n_size) {
                            size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);

                            ggml_backend_event_synchronize(events[buffer_idx]);
                            file->read_raw(host_ptrs[buffer_idx], read_iteration);
                            ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
                            ggml_backend_event_record(events[buffer_idx], upload_backend);

                            bytes_read += read_iteration;
                            ++buffer_idx;
                            buffer_idx %= n_buffers;
                        }
                    } else {
                        read_buf.resize(n_size);
                        file->seek(weight->offs, SEEK_SET);
                        file->read_raw(read_buf.data(), n_size);
                        ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                        if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                        }
                    }
                }
            }

            size_done += n_size;
        }
-       }

        // free temporary resources used for async uploads
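The "find the buffer map allocated for the tensor" block above pairs the i-th buffer registered for a file with the i-th byte range recorded in buffer_ranges, so a tensor is matched by finding the range that fully contains [offs, offs + n_size). Below is a minimal stand-alone sketch of that lookup, with plain std::string values standing in for ggml_backend_buffer_t and hard-coded ranges; every name and number here is illustrative, not from the repository:

    #include <cstddef>
    #include <iostream>
    #include <iterator>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // two buffers were created for file index 0, each covering part of the mapping
        std::multimap<int, std::string> buffers = {
            {0, "metal-buffer"},
            {0, "cpu-buffer"},
        };
        // buffer_ranges analogue: the i-th range belongs to the i-th buffer above
        std::vector<std::pair<size_t, size_t>> ranges = {
            {0,      4096},   // covered by "metal-buffer"
            {8192, 131072},   // covered by "cpu-buffer"
        };

        size_t offs   = 16384; // tensor offset within the mapped file
        size_t n_size = 1024;  // tensor size in bytes

        std::string chosen = "<none>";
        auto bufs = buffers.equal_range(0);
        for (size_t i = 0; i < ranges.size(); ++i) {
            if (offs >= ranges[i].first && offs + n_size <= ranges[i].second) {
                auto it = bufs.first;
                std::advance(it, i); // i-th buffer corresponds to i-th range
                chosen = it->second;
                break;
            }
        }
        std::cout << "tensor goes into: " << chosen << "\n"; // prints "cpu-buffer"
        return 0;
    }

The lookup relies on the multimap preserving insertion order for equal keys (guaranteed since C++11), so the index into the range list is enough to get back to the matching buffer.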
@@ -5571,46 +5563,10 @@ struct llama_model_loader {
            throw std::runtime_error("found tensors with invalid data");
        }

-       // check if this is the last call and do final cleanup
-       if (size_done >= size_data) {
-           // unmap offloaded tensors and metadata
-           if (use_mmap) {
-               for (uint32_t idx = 0; idx < mappings.size(); idx++) {
-                   auto & mapping = mappings.at(idx);
-                   auto & ranges_used = buffer_ranges[ctx][idx];
-
-                   std::sort(ranges_used.begin(), ranges_used.end(), [](
-                       const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
-                       return a.first < b.first;
-                   });
-
-                   size_t prev_end = 0;
-                   for (const auto & range : ranges_used) {
-                       size_t first = range.first;
-                       size_t last = range.second;
-                       if (first > prev_end) {
-                           mapping->unmap_fragment(prev_end, first);
-                       }
-                       prev_end = last;
-                   }
-
-                   if (prev_end < mapping->size) {
-                       mapping->unmap_fragment(prev_end, mapping->size);
-                   }
-
-                   // NOTE: mmap_used is replaced by buffer_ranges
-                   // const auto & mmap_used = mmaps_used.at(idx);
-                   // mapping->unmap_fragment(0, mmap_used.first);
-                   // if (mmap_used.second != 0) {
-                   //     mapping->unmap_fragment(mmap_used.second, mapping->size);
-                   // }
-               }
-           }
-           if (progress_callback) {
-               // Even though the model is done loading, we still honor
-               // cancellation since we need to free allocations.
-               return progress_callback(1.0f, progress_callback_user_data);
-           }
+       if (progress_callback) {
+           // Even though the model is done loading, we still honor
+           // cancellation since we need to free allocations.
+           return progress_callback(1.0f, progress_callback_user_data);
        }

        return true;
@@ -7430,6 +7386,7 @@ static bool llm_load_tensors_impl(
        enum llama_split_mode split_mode,
        int main_gpu,
        bool use_mlock,
+       bool keep_inp_out_in_metal,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    auto & hparams = model.hparams;
@@ -9315,7 +9272,7 @@ static bool llm_load_tensors_impl(
        void * addr = nullptr;
        auto & ranges = ctx_buffer_ranges[idx];

-       ml.get_mapping_ranges(ranges, &addr, idx, ctx, cpu_ctx);
+       ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_inp_out_in_metal ? cpu_ctx : nullptr);

        for (const auto & range : ranges) {
            size_t first = range.first;
@@ -9373,11 +9330,54 @@ static bool llm_load_tensors_impl(
    for (auto & it : ctx_bufs) {
        ggml_context * ctx = it.first;
        auto & bufs = it.second;
-       if (!ml.load_all_data(ctx, cpu_ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+       if(!ml.load_all_data(ctx, bufs, buffer_ranges, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
            return false;
        }
    }

+   // check if this is the last call and do final cleanup
+   if (ml.size_done >= ml.size_data) {
+       // unmap offloaded tensors and metadata
+       if (ml.use_mmap) {
+           for (uint32_t idx = 0; idx < ml.mappings.size(); idx++) {
+               auto & mapping = ml.mappings.at(idx);
+
+               // flatten ranges from all contexts into a vector
+               std::vector<std::pair<size_t, size_t>> ranges_used;
+               for (auto & ctx_buf_ranges : buffer_ranges) {
+                   auto & ranges = ctx_buf_ranges.second[idx];
+                   ranges_used.insert(ranges_used.end(), ranges.begin(), ranges.end());
+               }
+
+               std::sort(ranges_used.begin(), ranges_used.end(), [](
+                   const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
+                   return a.first < b.first;
+               });
+
+               size_t prev_end = 0;
+               for (const auto & range : ranges_used) {
+                   size_t first = range.first;
+                   size_t last = range.second;
+                   if (first > prev_end) {
+                       mapping->unmap_fragment(prev_end, first);
+                   }
+                   prev_end = std::max(prev_end, last);
+               }
+
+               if (prev_end < mapping->size) {
+                   mapping->unmap_fragment(prev_end, mapping->size);
+               }
+
+               // NOTE: mmap_used is replaced by buffer_ranges
+               // const auto & mmap_used = mmaps_used.at(idx);
+               // mapping->unmap_fragment(0, mmap_used.first);
+               // if (mmap_used.second != 0) {
+               //     mapping->unmap_fragment(mmap_used.second, mapping->size);
+               // }
+           }
+       }
+   }
+
    if (use_mmap_buffer) {
        for (auto & mapping : ml.mappings) {
            model.mappings.emplace_back(std::move(mapping));
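This block moves the final unmap pass out of load_all_data (removed in the earlier hunk) so it runs once after all contexts have been loaded: the used ranges of every context are flattened into one list, sorted by start offset, and only the gaps between them plus the tail of the mapping are released. Taking prev_end = std::max(prev_end, last) instead of prev_end = last appears to be one of the "fix bugs in unmap" items, since a nested or overlapping range would otherwise pull prev_end backwards and let a later "gap" cover bytes that are still in use. A stand-alone sketch of the sweep, with unmap_fragment mocked as a print and made-up numbers:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    static void unmap_fragment(size_t first, size_t last) {
        std::cout << "unmap [" << first << ", " << last << ")\n";
    }

    int main() {
        const size_t mapping_size = 100;
        // ranges gathered from several contexts; note the overlap of {10,60} and {20,30}
        std::vector<std::pair<size_t, size_t>> ranges_used = {
            {70, 90}, {10, 60}, {20, 30},
        };

        std::sort(ranges_used.begin(), ranges_used.end(),
                  [](const std::pair<size_t, size_t> & a, const std::pair<size_t, size_t> & b) {
                      return a.first < b.first;
                  });

        size_t prev_end = 0;
        for (const auto & range : ranges_used) {
            if (range.first > prev_end) {
                unmap_fragment(prev_end, range.first);
            }
            prev_end = std::max(prev_end, range.second); // never move prev_end backwards
        }
        if (prev_end < mapping_size) {
            unmap_fragment(prev_end, mapping_size);
        }
        // output: unmap [0, 10), unmap [60, 70), unmap [90, 100)
        return 0;
    }

With the plain prev_end = range.second assignment, the nested range {20,30} would reset prev_end to 30 and the sweep would "unmap" [30, 70), which still contains the in-use bytes [30, 60).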
@@ -9396,7 +9396,7 @@ int llm_load_tensors(
    try {
        if (!llm_load_tensors_impl(
            *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
-           params.main_gpu, params.use_mlock, params.progress_callback, params.progress_callback_user_data
+           params.main_gpu, params.use_mlock, params.keep_inp_out_in_metal, params.progress_callback, params.progress_callback_user_data
        )) {
            return -2;
        }