mirror of https://github.com/LostRuins/koboldcpp.git
additional padding for the swa kv cache itself
parent ff2cabc28f
commit 2e14338455
2 changed files with 8 additions and 1 deletion
@@ -45,7 +45,7 @@
 #include "common/common.h"

 //const
-const int extra_context_handle_fragmentation = 120;
+const int extra_context_handle_fragmentation = 128;
 const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
 const int LLAVA_TOKEN_IDENTIFIER_B = -999;

@@ -2174,6 +2174,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
     }
+    else
+    {
+        llama_ctx_params.n_ctx += (extra_context_handle_fragmentation/2);
+    }

     llama_ctx_params.offload_kqv = !inputs.low_vram;
     model_params.use_mmap = inputs.use_mmap;
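For context, a minimal sketch of the n_ctx padding behaviour in the hunk above. The condition tested by the surrounding if/else is not visible in this hunk, so the boolean parameter below is a hypothetical stand-in; only the constant and the full/half padding arithmetic come from the diff.

#include <cstdint>
#include <cstdio>

// Matches the constant bumped in the first hunk (120 -> 128).
static const int extra_context_handle_fragmentation = 128;

// "full_padding_case" is a hypothetical stand-in for whatever condition
// gpttype_load_model actually checks; it is not shown in this commit.
static uint32_t padded_n_ctx(uint32_t requested_n_ctx, bool full_padding_case) {
    if (full_padding_case) {
        return requested_n_ctx + extra_context_handle_fragmentation;       // e.g. 4096 -> 4224
    }
    return requested_n_ctx + (extra_context_handle_fragmentation / 2);     // e.g. 4096 -> 4160
}

int main() {
    printf("%u %u\n", (unsigned) padded_n_ctx(4096, true),
                      (unsigned) padded_n_ctx(4096, false)); // prints: 4224 4160
    return 0;
}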
@@ -29,6 +29,9 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(

     uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));

+    //kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
+    size_swa += 32;
+
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
         LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
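A worked example of the new SWA cache sizing as a standalone sketch: GGML_PAD is re-defined locally to round its first argument up to a multiple of the second (the same behaviour as the macro in ggml), and the hparams/batch values are illustrative placeholders rather than numbers taken from any particular model.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Round x up to the next multiple of n, mirroring GGML_PAD's behaviour.
#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

int main() {
    // Illustrative values only; the real ones come from the model hparams and context params.
    uint32_t n_swa     = 4096;  // sliding-window size
    uint32_t n_seq_max = 1;     // maximum number of parallel sequences
    uint32_t n_ubatch  = 512;   // micro-batch size
    uint32_t n_pad     = 32;    // padding granularity of the cache
    uint32_t size_base = 8192;  // base (non-SWA) cache size, placeholder

    uint32_t size_swa = std::min(size_base, GGML_PAD(n_swa*n_seq_max + n_ubatch, n_pad));
    // kcpp change: add 32 extra cells on top of the GGML_PAD result.
    size_swa += 32;

    printf("size_swa = %u\n", (unsigned) size_swa); // 4608 + 32 = 4640 with these numbers
    return 0;
}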