mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
llama : allow other bufts when overriding to CPU, add --no-repack option (#14990)
This commit is contained in:
parent
e08a98826b
commit
d6818d06a6
5 changed files with 39 additions and 21 deletions
|
@ -2095,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.no_kv_offload = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
||||
add_opt(common_arg(
|
||||
{"-nr", "--no-repack"},
|
||||
"disable weight repacking",
|
||||
[](common_params & params) {
|
||||
params.no_extra_bufts = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_NO_REPACK"));
|
||||
add_opt(common_arg(
|
||||
{"-ctk", "--cache-type-k"}, "TYPE",
|
||||
string_format(
|
||||
|
|
|
@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
|||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
mparams.use_extra_bufts = !params.no_extra_bufts;
|
||||
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
|
|
|
@ -359,6 +359,7 @@ struct common_params {
|
|||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
bool no_op_offload = false; // globally disable offload host tensor operations to device
|
||||
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
|
||||
|
||||
bool single_turn = false; // single turn chat conversation
|
||||
|
||||
|
|
|
@ -288,6 +288,7 @@ extern "C" {
|
|||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool check_tensors; // validate model tensor data
|
||||
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
||||
};
|
||||
|
||||
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||
|
|
|
@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
|
|||
}
|
||||
|
||||
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
|
||||
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
|
||||
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
|
||||
buft_list_t buft_list;
|
||||
|
||||
// add ACCEL buffer types
|
||||
|
@ -319,8 +319,8 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|||
}
|
||||
}
|
||||
|
||||
// add extra buffer types, only if no GPU device is present
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
|
||||
// add extra buffer types
|
||||
if (use_extra_bufts) {
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
|
@ -336,6 +336,7 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|||
++extra_bufts;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add the CPU buffer type
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
|
@ -1839,7 +1840,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
|
||||
|
||||
// build a list of buffer types for the CPU and GPU devices
|
||||
pimpl->cpu_buft_list = make_cpu_buft_list(devices);
|
||||
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
|
||||
for (auto * dev : devices) {
|
||||
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
|
||||
// add CPU buffer types as a fallback
|
||||
|
@ -2044,7 +2045,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
||||
std::regex pattern(overrides->pattern);
|
||||
if (std::regex_search(tensor_name, pattern)) {
|
||||
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
||||
// when overriding to a CPU buffer, consider the extra buffer types
|
||||
buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
|
||||
} else {
|
||||
buft = overrides->buft;
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
||||
tensor_name.c_str(),
|
||||
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
||||
|
@ -17839,6 +17846,7 @@ llama_model_params llama_model_default_params() {
|
|||
/*.use_mmap =*/ true,
|
||||
/*.use_mlock =*/ false,
|
||||
/*.check_tensors =*/ false,
|
||||
/*.use_extra_bufts =*/ true,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue