mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-18 23:49:46 +00:00
added swa padding controls
This commit is contained in:
parent
a9e817fb4c
commit
0251c6dbde
5 changed files with 17 additions and 4 deletions
1
expose.h
1
expose.h
|
|
@ -76,6 +76,7 @@ struct load_model_inputs
|
|||
const bool check_slowness = false;
|
||||
const bool highpriority = false;
|
||||
const bool swa_support = false;
|
||||
const int swa_padding = 0;
|
||||
const bool smartcache = false;
|
||||
const int smartcacheslots = 0;
|
||||
const bool pipelineparallel = false;
|
||||
|
|
|
|||
|
|
@ -2173,6 +2173,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
kcpp_data->use_fastforward = inputs.use_fastforward;
|
||||
kcpp_data->smartcache = inputs.smartcache;
|
||||
kcpp_data->swa_full = !inputs.swa_support;
|
||||
kcpp_extra_swa_padding = inputs.swa_padding;
|
||||
if (!kcpp_data->swa_full) {
|
||||
if (inputs.use_contextshift) {
|
||||
kcpp_data->swa_full = true; //cannot use SWA
|
||||
|
|
|
|||
10
koboldcpp.py
10
koboldcpp.py
|
|
@ -63,6 +63,7 @@ overridekv_max = 16
|
|||
default_autofit_padding = 1024
|
||||
lora_filenames_max = 4
|
||||
multiuser_concurrent_limit = 10
|
||||
swa_padding_default = 1024
|
||||
|
||||
# abuse prevention
|
||||
stop_token_max = 256
|
||||
|
|
@ -260,6 +261,7 @@ class load_model_inputs(ctypes.Structure):
|
|||
("check_slowness", ctypes.c_bool),
|
||||
("highpriority", ctypes.c_bool),
|
||||
("swa_support", ctypes.c_bool),
|
||||
("swa_padding", ctypes.c_int),
|
||||
("smartcache", ctypes.c_bool),
|
||||
("smartcacheslots", ctypes.c_int),
|
||||
("pipelineparallel", ctypes.c_bool),
|
||||
|
|
@ -1943,6 +1945,7 @@ def load_model(model_filename):
|
|||
inputs.check_slowness = (not args.highpriority and os.name == 'nt' and 'Intel' in platform.processor())
|
||||
inputs.highpriority = args.highpriority
|
||||
inputs.swa_support = args.useswa
|
||||
inputs.swa_padding = args.swapadding
|
||||
scint = int(args.smartcache)
|
||||
inputs.smartcache = False if scint<=0 else True
|
||||
sclimit = (savestate_limit_default if scint<=1 else scint)
|
||||
|
|
@ -7245,6 +7248,7 @@ def show_gui():
|
|||
contextshift_var = ctk.IntVar(value=1)
|
||||
fastforward_var = ctk.IntVar(value=1)
|
||||
swa_var = ctk.IntVar(value=0)
|
||||
swa_padding_var = ctk.StringVar(value=str(swa_padding_default))
|
||||
smartcache_var = ctk.IntVar(value=0)
|
||||
smartcacheslots_var = ctk.StringVar(value=str(savestate_limit_default))
|
||||
remotetunnel_var = ctk.IntVar(value=0)
|
||||
|
|
@ -7960,7 +7964,8 @@ def show_gui():
|
|||
smartcontextbox = makecheckbox(context_tab, "Use SmartContext", smartcontext_var, 1,tooltiptxt="Uses SmartContext. Now considered outdated and not recommended.\nCheck the wiki for more info.")
|
||||
makecheckbox(context_tab, "Use ContextShift", contextshift_var, 2,tooltiptxt="Uses Context Shifting to reduce reprocessing.\nRecommended. Check the wiki for more info.", command=togglectxshift)
|
||||
makecheckbox(context_tab, "Use FastForwarding", fastforward_var, 3,tooltiptxt="Use fast forwarding to recycle previous context (always reprocess if disabled).\nRecommended.", command=togglefastforward)
|
||||
makecheckbox(context_tab, "Use Sliding Window Attention (SWA)", swa_var, 4,tooltiptxt="Allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", command=toggleswa)
|
||||
makecheckbox(context_tab, "Use SWA", swa_var, 4,tooltiptxt="Allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", command=toggleswa)
|
||||
swa_padding_entry,swa_padding_label = makelabelentry(context_tab,"SWA Padding Tokens:", swa_padding_var, 4, 50, padx=300,singleline=True,tooltip="If the SWA is too small, you can expand it with padding, allowing for greater distance context rewinds.",labelpadx=160)
|
||||
makecheckbox(context_tab, "Use SmartCache", smartcache_var, 5,tooltiptxt="Enables intelligent context switching by saving KV cache snapshots to RAM. Requires fast forwarding.", command=togglesmartcache)
|
||||
makelabelentry(context_tab, "CacheSlots:", smartcacheslots_var, row=5, padx=(300), singleline=True, tooltip="Number of slots for smartcache",labelpadx=(220))
|
||||
|
||||
|
|
@ -8301,6 +8306,7 @@ def show_gui():
|
|||
args.noshift = contextshift_var.get()==0
|
||||
args.nofastforward = fastforward_var.get()==0
|
||||
args.useswa = swa_var.get()==1
|
||||
args.swapadding = int(swa_padding_var.get()) if swa_padding_var.get()!="" else 0
|
||||
args.smartcache = (0 if smartcache_var.get()!=1 else int(smartcacheslots_var.get()))
|
||||
args.remotetunnel = remotetunnel_var.get()==1
|
||||
args.foreground = keepforeground.get()==1
|
||||
|
|
@ -8550,6 +8556,7 @@ def show_gui():
|
|||
contextshift_var.set(0 if "noshift" in mydict and mydict["noshift"] else 1)
|
||||
fastforward_var.set(0 if "nofastforward" in mydict and mydict["nofastforward"] else 1)
|
||||
swa_var.set(1 if "useswa" in mydict and mydict["useswa"] else 0)
|
||||
swa_padding_var.set(mydict["swapadding"] if ("swapadding" in mydict and mydict["swapadding"]) else 0)
|
||||
smartcache_var.set(1 if "smartcache" in mydict and mydict["smartcache"] else 0)
|
||||
smartcacheslots_var.set(mydict["smartcache"] if ("smartcache" in mydict and mydict["smartcache"] and int(mydict["smartcache"])>1) else savestate_limit_default)
|
||||
remotetunnel_var.set(1 if "remotetunnel" in mydict and mydict["remotetunnel"] else 0)
|
||||
|
|
@ -11133,6 +11140,7 @@ if __name__ == '__main__':
|
|||
advparser.add_argument("--noshift","--no-context-shift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true')
|
||||
advparser.add_argument("--nofastforward", help="If set, do not attempt to fast forward GGUF context (always reprocess). Will also enable noshift", action='store_true')
|
||||
advparser.add_argument("--useswa", help="If set, allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", action='store_true')
|
||||
advparser.add_argument("--swapadding", help="How much extra to pad the SWA KV cache, this affects the rewind limit before reprocessing is forced.", type=int, default=swa_padding_default)
|
||||
advparser.add_argument("--smartcache", help="Enables intelligent context switching by saving KV cache snapshots to RAM. Requires fast forwarding.", metavar=('limit'), nargs='?', const=1, type=int, default=0)
|
||||
advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
|
||||
advparser.add_argument("--overridenativecontext", help="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.",metavar=('[trained context]'), type=int, default=0)
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@
|
|||
//
|
||||
// llama_kv_cache_iswa
|
||||
//
|
||||
//kcpp: global value for the extra padding added to the swa kv cache size
|
||||
static int kcpp_extra_swa_padding = 0;
|
||||
|
||||
llama_kv_cache_iswa::llama_kv_cache_iswa(
|
||||
const llama_model & model,
|
||||
|
|
@ -51,6 +53,7 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
|
|||
|
||||
//kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
|
||||
size_swa += 128;
|
||||
size_swa += kcpp_extra_swa_padding;
|
||||
size_swa = GGML_PAD(size_swa, n_pad);
|
||||
|
||||
// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
|
||||
|
|
|
|||
|
|
@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(
|
|||
|
||||
for (uint32_t il = 0; il < hparams.n_layer; il++) {
|
||||
if (!hparams.has_kv(il)) {
|
||||
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
|
||||
// LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -232,12 +232,12 @@ llama_kv_cache::llama_kv_cache(
|
|||
const int32_t il_reuse = reuse(il);
|
||||
|
||||
if (il_reuse < 0) {
|
||||
LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
|
||||
// LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (filter && !filter(il)) {
|
||||
LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
|
||||
// LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue