mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
Merge branch 'master' into concedo_experimental
# Conflicts: # .devops/nix/package.nix # .github/workflows/build.yml # CMakeLists.txt # Makefile # README.md # flake.nix # llama.cpp # llama.h # tests/test-llama-grammar.cpp
This commit is contained in:
commit
6dc01297f8
36 changed files with 2917 additions and 10930 deletions
|
@ -1790,28 +1790,28 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
if (llama_supports_mlock())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported())
|
||||
if (llama_supports_mmap())
|
||||
{
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||
printf(" - none: use one GPU only\n");
|
||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||
printf(" - row: split rows across GPUs\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row)\n");
|
||||
#endif
|
||||
if (llama_supports_gpu_offload()) {
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||
printf(" - none: use one GPU only\n");
|
||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||
printf(" - row: split rows across GPUs\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row)\n");
|
||||
}
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
printf(" -a ALIAS, --alias ALIAS\n");
|
||||
|
@ -1885,7 +1885,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.api_keys.push_back(argv[i]);
|
||||
sparams.api_keys.emplace_back(argv[i]);
|
||||
}
|
||||
else if (arg == "--api-key-file")
|
||||
{
|
||||
|
@ -2067,13 +2067,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
||||
if (llama_supports_gpu_offload()) {
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
} else {
|
||||
LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
||||
"See main README.md for information on enabling GPU BLAS support",
|
||||
{{"n_gpu_layers", params.n_gpu_layers}});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if (arg == "--split-mode" || arg == "-sm")
|
||||
{
|
||||
|
@ -2116,9 +2116,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
||||
|
||||
for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
|
||||
for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
|
||||
{
|
||||
if (i_device < split_arg.size())
|
||||
{
|
||||
|
@ -2161,7 +2161,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-scaled")
|
||||
|
@ -2177,7 +2177,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-base")
|
||||
|
@ -2319,7 +2319,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
}
|
||||
}
|
||||
if (!params.kv_overrides.empty()) {
|
||||
params.kv_overrides.emplace_back(llama_model_kv_override());
|
||||
params.kv_overrides.emplace_back();
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue