From ac5d63b09e465e0b3e8c998088ff2f550055b850 Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Sat, 25 Jan 2025 23:51:16 +0400 Subject: [PATCH] add explanation for why the output layer weights should be kept in metal shared memory --- common/arg.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 1cab211d..12c7788c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -738,6 +738,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } ).set_env("LLAMA_ARG_CUDA_MEM")); // #ifdef GGML_USE_METAL +// // warn: if the output layer weights are not kept in metal shared memory, their mmap-ed weight data +// // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency. +// // so we recommend keeping the output layer weights in metal shared memory. // add_opt(llama_arg( // {"--keep-out-in-metal"}, // format("whether to keep output weights in metal memory (default: %s)", params.keep_out_in_metal ? "true" : "false"),