From ac5d63b09e465e0b3e8c998088ff2f550055b850 Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Sat, 25 Jan 2025 23:51:16 +0400 Subject: [PATCH] add explanation for why the output layer weights should be kept in metal shared memory --- common/arg.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 1cab211d..12c7788c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -738,6 +738,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } ).set_env("LLAMA_ARG_CUDA_MEM")); // #ifdef GGML_USE_METAL +// // warn: if the output layer weights are not kept in metal shared memory, their mmap-ed weight data +// // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency. +// // so we recommend keeping the output layer weights in metal shared memory. // add_opt(llama_arg( // {"--keep-out-in-metal"}, // format("whether to keep output weights in metal memory (default: %s)", params.keep_out_in_metal ? "true" : "false"),