diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 8704193e8..92d353285 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -45,7 +45,7 @@
 #include "common/common.h"
 
 //const
-const int extra_context_handle_fragmentation = 120;
+const int extra_context_handle_fragmentation = 128;
 const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
 const int LLAVA_TOKEN_IDENTIFIER_B = -999;
@@ -2174,6 +2174,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
     }
+    else
+    {
+        llama_ctx_params.n_ctx += (extra_context_handle_fragmentation/2);
+    }
 
     llama_ctx_params.offload_kqv = !inputs.low_vram;
     model_params.use_mmap = inputs.use_mmap;
diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp
index b9169299c..4852af80a 100644
--- a/src/llama-kv-cache-unified-iswa.cpp
+++ b/src/llama-kv-cache-unified-iswa.cpp
@@ -29,6 +29,9 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
     uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
 
+    //kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
+    size_swa += 32;
+
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
         LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
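
Note: below is a minimal standalone sketch of the padding behaviour this patch implies, not the actual koboldcpp/llama.cpp code. The predicate name needs_full_pad is hypothetical (the condition guarding the original if is not visible in the hunk), and round_up stands in for GGML_PAD by simply rounding up to a multiple of n_pad.

#include <algorithm>
#include <cstdint>

constexpr uint32_t extra_context_handle_fragmentation = 128;

// Pad the requested context so KV-cache fragmentation does not eat into the
// window the user asked for: the full pad (128) in one branch, half (64) in the other.
uint32_t padded_n_ctx(uint32_t requested_n_ctx, bool needs_full_pad /* hypothetical flag */) {
    if (needs_full_pad) {
        return requested_n_ctx + extra_context_handle_fragmentation;
    }
    return requested_n_ctx + extra_context_handle_fragmentation / 2;
}

// Mirror of the upstream SWA-cache size formula plus the extra 32-cell pad added
// by the patch; round_up(x, p) rounds x up to the next multiple of p.
uint32_t padded_size_swa(uint32_t size_base, uint32_t n_swa, uint32_t n_seq_max,
                         uint32_t n_ubatch, uint32_t n_pad) {
    auto round_up = [](uint32_t x, uint32_t p) { return ((x + p - 1) / p) * p; };
    uint32_t size_swa = std::min(size_base, round_up(n_swa * n_seq_max + n_ubatch, n_pad));
    return size_swa + 32;
}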