keep the output layer weights in shared memory by default

This commit is contained in:
Lizonghang 2025-01-25 23:31:43 +04:00
parent f3dd5776eb
commit 1ca9a43bd1
4 changed files with 16 additions and 16 deletions

View file

@@ -737,15 +737,15 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.gpu_mem = value; // in GiB
}
).set_env("LLAMA_ARG_CUDA_MEM"));
#ifdef GGML_USE_METAL
add_opt(llama_arg(
{"--keep-out-in-metal"},
format("whether to keep output weights in metal memory (default: %s)", params.keep_out_in_metal ? "true" : "false"),
[](gpt_params & params) {
params.keep_out_in_metal = true;
}
).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
#endif
// #ifdef GGML_USE_METAL
// add_opt(llama_arg(
// {"--keep-out-in-metal"},
// format("whether to keep output weights in metal memory (default: %s)", params.keep_out_in_metal ? "true" : "false"),
// [](gpt_params & params) {
// params.keep_out_in_metal = true;
// }
// ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
// #endif
add_opt(llama_arg(
{"-n", "--predict", "--n-predict"}, "N",
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),