Use full-size SWA cache unless context shift and fast forward are disabled

This commit is contained in:
Concedo 2025-05-21 22:47:45 +08:00
commit 9f976e9c65
16 changed files with 1429 additions and 654 deletions

View file

@ -1446,6 +1446,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_keep = value;
}
));
// Registers the "--swa-full" command-line option.
// The help text is built with string_format and shows the current value of
// params.swa_full as the default ("true"/"false").
// The handler lambda takes no value argument and sets params.swa_full to true
// (presumably invoked when the flag is parsed — confirm against common_arg).
add_opt(common_arg(
{"--swa-full"},
string_format("use full-size SWA cache (default: %s)\n"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
[](common_params & params) {
params.swa_full = true;
}
));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),

View file

@ -1144,6 +1144,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
if (params.reranking) {
cparams.embeddings = true;

View file

@ -319,6 +319,7 @@ struct common_params {
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads