llama : allow other bufts when overriding to CPU, add --no-repack option (#14990)

Diego Devesa 2025-07-31 09:11:34 -07:00 committed by GitHub
parent e08a98826b
commit d6818d06a6
5 changed files with 39 additions and 21 deletions

common/arg.cpp

@@ -2095,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
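The new option is registered through the common argument parser, so any tool that calls common_params_parser_init picks it up; the .set_env call above also allows repacking to be disabled by setting the LLAMA_ARG_NO_REPACK environment variable instead of passing -nr/--no-repack.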

common/common.cpp

@@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;

     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;

common/common.h

@@ -359,6 +359,7 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)

     bool single_turn = false; // single turn chat conversation

include/llama.h

@@ -288,6 +288,7 @@ extern "C" {
         bool use_mmap;        // use mmap if possible
         bool use_mlock;       // force system to keep model in RAM
         bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
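For programs that use the C API directly instead of common_params, the same switch is the new use_extra_bufts field on llama_model_params. A minimal sketch of disabling repacking programmatically, assuming the usual llama_model_load_from_file / llama_model_free entry points ("model.gguf" is a placeholder path):

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    // library defaults; after this commit use_extra_bufts defaults to true
    llama_model_params mparams = llama_model_default_params();

    // equivalent of --no-repack: keep CPU weights in their original layout
    // instead of the repacked "extra" buffer types
    mparams.use_extra_bufts = false;

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}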

src/llama-model.cpp

@@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }

 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;

     // add ACCEL buffer types
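The comment above spells out the priority order of the CPU buffer-type list: ACCEL devices first, then GPU host (pinned) memory, then the CPU "extra" (repacked) buffer types, and finally the plain CPU buffer type. A hedged sketch of the first-match idea behind select_weight_buft, not the actual llama.cpp implementation, using only public ggml-backend calls:

#include "ggml-backend.h"
#include <utility>
#include <vector>

// a priority-ordered list of (device, buffer type) pairs, mirroring buft_list_t
using buft_entry = std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>;

// return the first buffer type whose device can run the weight's op and accepts
// the buffer type; nullptr means the caller must fall back or report an error
static ggml_backend_buffer_type_t pick_first_supported(
        const std::vector<buft_entry> & buft_list, const struct ggml_tensor * op) {
    for (const auto & [dev, buft] : buft_list) {
        if (ggml_backend_dev_supports_op(dev, op) && ggml_backend_dev_supports_buft(dev, buft)) {
            return buft;
        }
    }
    return nullptr;
}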
@@ -319,8 +319,8 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }

-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+    // add extra buffer types
+    if (use_extra_bufts) {
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (cpu_dev == nullptr) {
         throw std::runtime_error(format("%s: no CPU backend found", __func__));
@@ -336,6 +336,7 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
             ++extra_bufts;
         }
     }
+    }

     // add the CPU buffer type
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
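The "extra" buffer types added inside this block are the ones the CPU backend exposes for repacked weight layouts (for example the aarch64 and AMX repacking paths), which is why the user-facing flag is called --no-repack: with use_extra_bufts set to false they are skipped entirely and weights stay in the plain CPU buffer type added below.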
@@ -1839,7 +1840,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -2044,7 +2045,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
             std::regex pattern(overrides->pattern);
             if (std::regex_search(tensor_name, pattern)) {
-                buft = overrides->buft;
+                if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                    // when overriding to a CPU buffer, consider the extra buffer types
+                    buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                } else {
+                    buft = overrides->buft;
+                }
+
                 LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                         tensor_name.c_str(),
                         ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
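This is the behavioral half of the commit title: a tensor buffer-type override that targets the CPU no longer bypasses the CPU buft list, so CPU-overridden weights can still land in a repacked "extra" buffer type unless repacking is disabled. A hedged sketch of setting such an override through the public API, mirroring the pattern/buft fields read by the loop above and assuming the llama_model_tensor_buft_override struct exposed in llama.h (the regex "ffn_.*_exps" and the model path are made-up examples):

#include "llama.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // NULL-terminated override list; each pattern is matched as a std::regex
    // against tensor names, exactly like the loader loop shown above
    const llama_model_tensor_buft_override overrides[] = {
        { "ffn_.*_exps", ggml_backend_cpu_buffer_type() },
        { NULL, NULL },
    };

    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;
    // use_extra_bufts is left at its default (true), so after this change the
    // CPU-overridden weights may still use a repacked CPU buffer type;
    // set mparams.use_extra_bufts = false to force the plain CPU layout

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_model_free(model);
    return 0;
}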
@@ -17839,6 +17846,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.check_tensors =*/ false,
+        /*.use_extra_bufts =*/ true,
     };

 #ifdef GGML_USE_METAL