added swa padding controls

2026-05-18 23:49:46 +00:00 · 2026-04-16 16:21:48 +08:00 · 2026-04-16 16:21:48 +08:00 · 0251c6dbde
commit 0251c6dbde
parent a9e817fb4c
5 changed files with 17 additions and 4 deletions
--- a/expose.h
+++ b/expose.h
@ -76,6 +76,7 @@ struct load_model_inputs
    const bool check_slowness = false;
    const bool highpriority = false;
    const bool swa_support = false;
+    const int swa_padding = 0;
    const bool smartcache = false;
    const int smartcacheslots = 0;
    const bool pipelineparallel = false;
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -2173,6 +2173,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    kcpp_data->use_fastforward = inputs.use_fastforward;
    kcpp_data->smartcache = inputs.smartcache;
    kcpp_data->swa_full = !inputs.swa_support;
+    kcpp_extra_swa_padding = inputs.swa_padding;
    if (!kcpp_data->swa_full) {
        if (inputs.use_contextshift) {
            kcpp_data->swa_full = true;  //cannot use SWA
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -63,6 +63,7 @@ overridekv_max = 16
 default_autofit_padding = 1024
 lora_filenames_max = 4
 multiuser_concurrent_limit = 10
+swa_padding_default = 1024

 # abuse prevention
 stop_token_max = 256
@ -260,6 +261,7 @@ class load_model_inputs(ctypes.Structure):
                ("check_slowness", ctypes.c_bool),
                ("highpriority", ctypes.c_bool),
                ("swa_support", ctypes.c_bool),
+                ("swa_padding", ctypes.c_int),
                ("smartcache", ctypes.c_bool),
                ("smartcacheslots", ctypes.c_int),
                ("pipelineparallel", ctypes.c_bool),
@ -1943,6 +1945,7 @@ def load_model(model_filename):
    inputs.check_slowness = (not args.highpriority and os.name == 'nt' and 'Intel' in platform.processor())
    inputs.highpriority = args.highpriority
    inputs.swa_support = args.useswa
+    inputs.swa_padding = args.swapadding
    scint = int(args.smartcache)
    inputs.smartcache = False if scint<=0 else True
    sclimit = (savestate_limit_default if scint<=1 else scint)
@ -7245,6 +7248,7 @@ def show_gui():
    contextshift_var = ctk.IntVar(value=1)
    fastforward_var = ctk.IntVar(value=1)
    swa_var = ctk.IntVar(value=0)
+    swa_padding_var = ctk.StringVar(value=str(swa_padding_default))
    smartcache_var = ctk.IntVar(value=0)
    smartcacheslots_var = ctk.StringVar(value=str(savestate_limit_default))
    remotetunnel_var = ctk.IntVar(value=0)
@ -7960,7 +7964,8 @@ def show_gui():
    smartcontextbox = makecheckbox(context_tab, "Use SmartContext", smartcontext_var, 1,tooltiptxt="Uses SmartContext. Now considered outdated and not recommended.\nCheck the wiki for more info.")
    makecheckbox(context_tab, "Use ContextShift", contextshift_var, 2,tooltiptxt="Uses Context Shifting to reduce reprocessing.\nRecommended. Check the wiki for more info.", command=togglectxshift)
    makecheckbox(context_tab, "Use FastForwarding", fastforward_var, 3,tooltiptxt="Use fast forwarding to recycle previous context (always reprocess if disabled).\nRecommended.", command=togglefastforward)
-    makecheckbox(context_tab, "Use Sliding Window Attention (SWA)", swa_var, 4,tooltiptxt="Allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", command=toggleswa)
+    makecheckbox(context_tab, "Use SWA", swa_var, 4,tooltiptxt="Allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", command=toggleswa)
+    swa_padding_entry,swa_padding_label = makelabelentry(context_tab,"SWA Padding Tokens:", swa_padding_var, 4, 50, padx=300,singleline=True,tooltip="If the SWA is too small, you can expand it with padding, allowing for greater distance context rewinds.",labelpadx=160)
    makecheckbox(context_tab, "Use SmartCache", smartcache_var, 5,tooltiptxt="Enables intelligent context switching by saving KV cache snapshots to RAM. Requires fast forwarding.", command=togglesmartcache)
    makelabelentry(context_tab, "CacheSlots:", smartcacheslots_var, row=5, padx=(300), singleline=True, tooltip="Number of slots for smartcache",labelpadx=(220))

@ -8301,6 +8306,7 @@ def show_gui():
        args.noshift = contextshift_var.get()==0
        args.nofastforward = fastforward_var.get()==0
        args.useswa = swa_var.get()==1
+        args.swapadding = int(swa_padding_var.get()) if swa_padding_var.get()!="" else 0
        args.smartcache = (0 if smartcache_var.get()!=1 else int(smartcacheslots_var.get()))
        args.remotetunnel = remotetunnel_var.get()==1
        args.foreground = keepforeground.get()==1
@ -8550,6 +8556,7 @@ def show_gui():
        contextshift_var.set(0 if "noshift" in mydict and mydict["noshift"] else 1)
        fastforward_var.set(0 if "nofastforward" in mydict and mydict["nofastforward"] else 1)
        swa_var.set(1 if "useswa" in mydict and mydict["useswa"] else 0)
+        swa_padding_var.set(mydict["swapadding"] if ("swapadding" in mydict and mydict["swapadding"]) else 0)
        smartcache_var.set(1 if "smartcache" in mydict and mydict["smartcache"] else 0)
        smartcacheslots_var.set(mydict["smartcache"] if ("smartcache" in mydict and mydict["smartcache"] and int(mydict["smartcache"])>1) else savestate_limit_default)
        remotetunnel_var.set(1 if "remotetunnel" in mydict and mydict["remotetunnel"] else 0)
@ -11133,6 +11140,7 @@ if __name__ == '__main__':
    advparser.add_argument("--noshift","--no-context-shift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true')
    advparser.add_argument("--nofastforward", help="If set, do not attempt to fast forward GGUF context (always reprocess). Will also enable noshift", action='store_true')
    advparser.add_argument("--useswa", help="If set, allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", action='store_true')
+    advparser.add_argument("--swapadding", help="How much extra to pad the SWA KV cache, this affects the rewind limit before reprocessing is forced.", type=int, default=swa_padding_default)
    advparser.add_argument("--smartcache", help="Enables intelligent context switching by saving KV cache snapshots to RAM. Requires fast forwarding.", metavar=('limit'), nargs='?', const=1, type=int, default=0)
    advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
    advparser.add_argument("--overridenativecontext", help="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.",metavar=('[trained context]'), type=int, default=0)
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@ -10,6 +10,8 @@
 //
 // llama_kv_cache_iswa
 //
+//kcpp: use a global flag to adjust swa padding
+static int kcpp_extra_swa_padding = 0;

 llama_kv_cache_iswa::llama_kv_cache_iswa(
        const llama_model & model,
@ -51,6 +53,7 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(

    //kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
    size_swa += 128;
+    size_swa += kcpp_extra_swa_padding;
    size_swa = GGML_PAD(size_swa, n_pad);

    // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(

    for (uint32_t il = 0; il < hparams.n_layer; il++) {
        if (!hparams.has_kv(il)) {
-            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+            // LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
            continue;
        }

@ -232,12 +232,12 @@ llama_kv_cache::llama_kv_cache(
            const int32_t il_reuse = reuse(il);

            if (il_reuse < 0) {
-                LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
+                // LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
                continue;
            }

            if (filter && !filter(il)) {
-                LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+                // LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
                continue;
            }