Added SWA padding controls

This commit is contained in:
Concedo 2026-04-16 16:21:48 +08:00
parent a9e817fb4c
commit 0251c6dbde
5 changed files with 17 additions and 4 deletions

View file

@ -76,6 +76,7 @@ struct load_model_inputs
const bool check_slowness = false;
const bool highpriority = false;
const bool swa_support = false;
const int swa_padding = 0;
const bool smartcache = false;
const int smartcacheslots = 0;
const bool pipelineparallel = false;

View file

@ -2173,6 +2173,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
kcpp_data->use_fastforward = inputs.use_fastforward;
kcpp_data->smartcache = inputs.smartcache;
kcpp_data->swa_full = !inputs.swa_support;
kcpp_extra_swa_padding = inputs.swa_padding;
if (!kcpp_data->swa_full) {
if (inputs.use_contextshift) {
kcpp_data->swa_full = true; //cannot use SWA

View file

@ -63,6 +63,7 @@ overridekv_max = 16
default_autofit_padding = 1024
lora_filenames_max = 4
multiuser_concurrent_limit = 10
# Default SWA padding amount: used as the default of the --swapadding CLI argument
# and as the initial value of the GUI "SWA Padding Tokens" entry.
swa_padding_default = 1024
# abuse prevention
stop_token_max = 256
@ -260,6 +261,7 @@ class load_model_inputs(ctypes.Structure):
("check_slowness", ctypes.c_bool),
("highpriority", ctypes.c_bool),
("swa_support", ctypes.c_bool),
("swa_padding", ctypes.c_int),
("smartcache", ctypes.c_bool),
("smartcacheslots", ctypes.c_int),
("pipelineparallel", ctypes.c_bool),
@ -1943,6 +1945,7 @@ def load_model(model_filename):
inputs.check_slowness = (not args.highpriority and os.name == 'nt' and 'Intel' in platform.processor())
inputs.highpriority = args.highpriority
inputs.swa_support = args.useswa
inputs.swa_padding = args.swapadding
scint = int(args.smartcache)
inputs.smartcache = False if scint<=0 else True
sclimit = (savestate_limit_default if scint<=1 else scint)
@ -7245,6 +7248,7 @@ def show_gui():
contextshift_var = ctk.IntVar(value=1)
fastforward_var = ctk.IntVar(value=1)
swa_var = ctk.IntVar(value=0)
swa_padding_var = ctk.StringVar(value=str(swa_padding_default))
smartcache_var = ctk.IntVar(value=0)
smartcacheslots_var = ctk.StringVar(value=str(savestate_limit_default))
remotetunnel_var = ctk.IntVar(value=0)
@ -7960,7 +7964,8 @@ def show_gui():
smartcontextbox = makecheckbox(context_tab, "Use SmartContext", smartcontext_var, 1,tooltiptxt="Uses SmartContext. Now considered outdated and not recommended.\nCheck the wiki for more info.")
makecheckbox(context_tab, "Use ContextShift", contextshift_var, 2,tooltiptxt="Uses Context Shifting to reduce reprocessing.\nRecommended. Check the wiki for more info.", command=togglectxshift)
makecheckbox(context_tab, "Use FastForwarding", fastforward_var, 3,tooltiptxt="Use fast forwarding to recycle previous context (always reprocess if disabled).\nRecommended.", command=togglefastforward)
makecheckbox(context_tab, "Use Sliding Window Attention (SWA)", swa_var, 4,tooltiptxt="Allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", command=toggleswa)
makecheckbox(context_tab, "Use SWA", swa_var, 4,tooltiptxt="Allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", command=toggleswa)
swa_padding_entry,swa_padding_label = makelabelentry(context_tab,"SWA Padding Tokens:", swa_padding_var, 4, 50, padx=300,singleline=True,tooltip="If the SWA is too small, you can expand it with padding, allowing for greater distance context rewinds.",labelpadx=160)
makecheckbox(context_tab, "Use SmartCache", smartcache_var, 5,tooltiptxt="Enables intelligent context switching by saving KV cache snapshots to RAM. Requires fast forwarding.", command=togglesmartcache)
makelabelentry(context_tab, "CacheSlots:", smartcacheslots_var, row=5, padx=(300), singleline=True, tooltip="Number of slots for smartcache",labelpadx=(220))
@ -8301,6 +8306,7 @@ def show_gui():
args.noshift = contextshift_var.get()==0
args.nofastforward = fastforward_var.get()==0
args.useswa = swa_var.get()==1
args.swapadding = int(swa_padding_var.get()) if swa_padding_var.get()!="" else 0
args.smartcache = (0 if smartcache_var.get()!=1 else int(smartcacheslots_var.get()))
args.remotetunnel = remotetunnel_var.get()==1
args.foreground = keepforeground.get()==1
@ -8550,6 +8556,7 @@ def show_gui():
contextshift_var.set(0 if "noshift" in mydict and mydict["noshift"] else 1)
fastforward_var.set(0 if "nofastforward" in mydict and mydict["nofastforward"] else 1)
swa_var.set(1 if "useswa" in mydict and mydict["useswa"] else 0)
swa_padding_var.set(mydict["swapadding"] if ("swapadding" in mydict and mydict["swapadding"]) else 0)
smartcache_var.set(1 if "smartcache" in mydict and mydict["smartcache"] else 0)
smartcacheslots_var.set(mydict["smartcache"] if ("smartcache" in mydict and mydict["smartcache"] and int(mydict["smartcache"])>1) else savestate_limit_default)
remotetunnel_var.set(1 if "remotetunnel" in mydict and mydict["remotetunnel"] else 0)
@ -11133,6 +11140,7 @@ if __name__ == '__main__':
advparser.add_argument("--noshift","--no-context-shift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true')
advparser.add_argument("--nofastforward", help="If set, do not attempt to fast forward GGUF context (always reprocess). Will also enable noshift", action='store_true')
advparser.add_argument("--useswa", help="If set, allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", action='store_true')
advparser.add_argument("--swapadding", help="How much extra to pad the SWA KV cache, this affects the rewind limit before reprocessing is forced.", type=int, default=swa_padding_default)
advparser.add_argument("--smartcache", help="Enables intelligent context switching by saving KV cache snapshots to RAM. Requires fast forwarding.", metavar=('limit'), nargs='?', const=1, type=int, default=0)
advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
advparser.add_argument("--overridenativecontext", help="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.",metavar=('[trained context]'), type=int, default=0)

View file

@ -10,6 +10,8 @@
//
// llama_kv_cache_iswa
//
//kcpp: extra amount added to size_swa when the SWA KV cache is sized in the constructor below,
//      on top of the fixed +128 padding. It is an integer amount, not an on/off flag.
//kcpp: assigned from inputs.swa_padding in gpttype_load_model; stays 0 unless set there.
//NOTE(review): declared static yet written from gpttype_load_model - presumably both are
//      compiled into the same translation unit; confirm.
static int kcpp_extra_swa_padding = 0;
llama_kv_cache_iswa::llama_kv_cache_iswa(
const llama_model & model,
@ -51,6 +53,7 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
//kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
size_swa += 128;
size_swa += kcpp_extra_swa_padding;
size_swa = GGML_PAD(size_swa, n_pad);
// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size

View file

@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(
for (uint32_t il = 0; il < hparams.n_layer; il++) {
if (!hparams.has_kv(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
// LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
continue;
}
@ -232,12 +232,12 @@ llama_kv_cache::llama_kv_cache(
const int32_t il_reuse = reuse(il);
if (il_reuse < 0) {
LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
// LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
continue;
}
if (filter && !filter(il)) {
LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
// LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
continue;
}