additional padding for the swa kv cache itself

Concedo 2025-06-28 15:52:48 +08:00
parent ff2cabc28f
commit 2e14338455
2 changed files with 8 additions and 1 deletion


@@ -45,7 +45,7 @@
 #include "common/common.h"
 //const
-const int extra_context_handle_fragmentation = 120;
+const int extra_context_handle_fragmentation = 128;
 const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
 const int LLAVA_TOKEN_IDENTIFIER_B = -999;
@@ -2174,6 +2174,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 {
     llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
 }
+else
+{
+    llama_ctx_params.n_ctx += (extra_context_handle_fragmentation/2);
+}
 llama_ctx_params.offload_kqv = !inputs.low_vram;
 model_params.use_mmap = inputs.use_mmap;
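
Note: taken together, the two hunks above mean the context size handed to llama.cpp is always grown by a fragmentation pad before the context is created: the full extra_context_handle_fragmentation (now 128 tokens, up from 120) in the branch shown first, and half of it (64 tokens) in the newly added else branch. The condition guarding the branch is outside the visible hunk, so the sketch below stands it in with a hypothetical use_full_pad flag; only the constant and the two increments come from the diff itself.

// Minimal sketch of the padding arithmetic above (not the actual koboldcpp
// control flow). `use_full_pad` is hypothetical; the real condition is not
// visible in this hunk.
#include <cstdint>
#include <cstdio>

static const int extra_context_handle_fragmentation = 128; // bumped from 120 in this commit

static uint32_t pad_n_ctx(uint32_t n_ctx, bool use_full_pad) {
    if (use_full_pad) {
        n_ctx += extra_context_handle_fragmentation;       // +128 tokens
    } else {
        n_ctx += (extra_context_handle_fragmentation / 2); // +64 tokens (new else branch)
    }
    return n_ctx;
}

int main() {
    printf("%u %u\n", pad_n_ctx(4096, true), pad_n_ctx(4096, false)); // prints: 4224 4160
}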


@@ -29,6 +29,9 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
     uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
+    //kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
+    size_swa += 32;
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
         LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
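
Note: the final hunk touches the second changed file, the unified iSWA KV-cache constructor (llama_kv_cache_unified_iswa). There the sliding-window cache size is derived from the window length and micro-batch size, rounded up to the padding granularity with GGML_PAD, and this commit adds a fixed 32-cell slack on top, mirroring what extra_context_handle_fragmentation does for the base cache. Below is a standalone sketch of that sizing arithmetic; the GGML_PAD definition mirrors the ggml macro, and all concrete values are made-up examples, not taken from the diff.

// Sketch of the SWA cache sizing shown above, with example values.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// round x up to a multiple of n (n must be a power of two); mirrors ggml's GGML_PAD
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    const uint32_t n_swa     = 4096; // sliding-window length (example value)
    const uint32_t n_seq_max = 1;    // max parallel sequences (example value)
    const uint32_t n_ubatch  = 512;  // micro-batch size (example value)
    const uint32_t n_pad     = 256;  // cache padding granularity (example value)
    const uint32_t size_base = 8320; // base KV cache size (example value)

    uint32_t size_swa = std::min(size_base, GGML_PAD(n_swa*n_seq_max + n_ubatch, n_pad));
    size_swa += 32; // kcpp: the extra slack added by this commit

    printf("size_swa = %u\n", size_swa); // 4608 + 32 = 4640
}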