Use full SWA cache unless ctx shift and fast forward are disabled

2025-09-16 03:49:42 +00:00 · 2025-05-21 22:47:45 +08:00 · 2025-05-21 22:47:45 +08:00 · 9f976e9c65
commit 9f976e9c65
parent 5b6ed445de e298d2fbd0
16 changed files with 1429 additions and 654 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -1446,6 +1446,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_keep = value;
        }
    ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1144,6 +1144,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.flash_attn        = params.flash_attn;
    cparams.no_perf           = params.no_perf;
    cparams.op_offload        = !params.no_op_offload;
+    cparams.swa_full          = params.swa_full;

    if (params.reranking) {
        cparams.embeddings    = true;
--- a/common/common.h
+++ b/common/common.h
@ -319,6 +319,7 @@ struct common_params {
    bool flash_attn        = false; // flash attention
    bool no_perf           = false; // disable performance metrics
    bool ctx_shift         = true;  // context shift on infinite text generation
+    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap          = true;  // use mmap for faster loads