From 9f4d0f6ccf481f1d92bc5453dc8feca891be665a Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 21 Jul 2025 23:34:22 +0800
Subject: [PATCH] fixed swa pp bug by retrying smaller batches

---
 gpttype_adapter.cpp                 | 36 +++++++++++++++++++++--------
 otherarch/utils.cpp                 | 10 ++++++++
 otherarch/utils.h                   |  1 +
 src/llama-kv-cache-unified-iswa.cpp |  2 +-
 4 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 145290d98..e267c5a81 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2174,14 +2174,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_model_params model_params = llama_model_default_params();
         llama_context_params llama_ctx_params = llama_context_default_params();
         llama_ctx_params.n_ctx = clamped_max_context_length;
-        if(kcpp_data->use_contextshift)
-        {
-            llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
-        }
-        else
-        {
-            llama_ctx_params.n_ctx += (extra_context_handle_fragmentation/2);
-        }
+        llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
 
         llama_ctx_params.offload_kqv = !inputs.low_vram;
         llama_ctx_params.kv_unified = true;
@@ -3844,7 +3837,31 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             {
                 draft_used = false;
                 kcpp_embd_batch batch = kcpp_embd_batch(embd, n_past, use_mrope, false);
-                evalres = (llama_decode(llama_ctx_v4, batch.batch)==0);
+                int32_t decode_status = llama_decode(llama_ctx_v4, batch.batch);
+                if(decode_status==1 && embd.size()>128)
+                {
+                    printf("Couldn't find a big KV slot. Retry with smaller batch size of 128...\n");
+                    std::vector<std::vector<int>> parts = split_big_vector(embd,128);
+                    int temp_past = n_past;
+                    evalres = true;
+                    for(int p=0;p<parts.size();++p)
+                    {
+                        std::vector<int> chunk = parts[p];
+                        kcpp_embd_batch smallbatch = kcpp_embd_batch(chunk, temp_past, use_mrope, false);
+                        int32_t decode_status2 = llama_decode(llama_ctx_v4, smallbatch.batch);
+                        if(debugmode==1 && !is_quiet)
+                        {
+                            printf("Retry chunk: %d at %d... status: %s\n",chunk.size(),temp_past,(decode_status2==0?"ok":"fail"));
+                        }
+                        evalres = (evalres && (decode_status2==0));
+                        temp_past += chunk.size();
+                    }
+                }
+                else
+                {
+                    evalres = (decode_status==0);
+                }
+
                 if(draft_ctx)
                 {
                     evalres = (evalres && (llama_decode(draft_ctx, batch.batch)==0));
@@ -3928,6 +3945,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         if (!evalres)
         {
             fprintf(stderr, "\nFailed to predict at token position %d! Check your context buffer sizes!\n",n_past);
+            media_composite_image_signature = ""; //force invalidate
             output.text = nullptr;
             output.status = 0;
             output.prompt_tokens = output.completion_tokens = 0;
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index 1a836c441..ece5373e5 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -356,6 +356,16 @@ std::string get_timestamp_str()
     return timestamp;
 }
 
+//split a big vector into multiple small vectors of chunk size or less
+std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size) {
+    std::vector<std::vector<int>> small_arrs;
+    for (size_t i = 0; i < big_arr.size(); i += chunk_size) {
+        size_t end = std::min(i + chunk_size, big_arr.size());
+        small_arrs.emplace_back(big_arr.begin() + i, big_arr.begin() + end);
+    }
+    return small_arrs;
+}
+
 std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate)
 {
     size_t input_size = input.size();
diff --git a/otherarch/utils.h b/otherarch/utils.h
index 4cfb829f1..b4c17dc0d 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -61,6 +61,7 @@ std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_leng
 std::string kcpp_base64_encode(const std::string &data);
 std::string get_timestamp_str();
+std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size);
 std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate);
 
 int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng);
diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp
index 4c4e7cbae..9280a84a4 100644
--- a/src/llama-kv-cache-unified-iswa.cpp
+++ b/src/llama-kv-cache-unified-iswa.cpp
@@ -31,7 +31,7 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
     uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
 
     //kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
-    size_swa += 32;
+    size_swa += 128;
     size_swa = GGML_PAD(size_swa, n_pad);
 
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
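
For reference, the retry logic added in gpttype_adapter.cpp boils down to the standalone sketch below. This is not koboldcpp code: fake_decode is a made-up stand-in for llama_decode (returning 1 when no contiguous KV slot fits the batch) and the token ids are invented; only split_big_vector is copied from the patch.

// Minimal sketch of the chunked-retry idea, under the assumptions stated above.
#include <algorithm>
#include <cstdio>
#include <vector>

// same helper as otherarch/utils.cpp: split a big vector into chunks of chunk_size or less
static std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size) {
    std::vector<std::vector<int>> small_arrs;
    for (size_t i = 0; i < big_arr.size(); i += chunk_size) {
        size_t end = std::min(i + chunk_size, big_arr.size());
        small_arrs.emplace_back(big_arr.begin() + i, big_arr.begin() + end);
    }
    return small_arrs;
}

// hypothetical stand-in for llama_decode: pretend any batch over 128 tokens cannot get a KV slot
static int fake_decode(const std::vector<int>& batch, int n_past) {
    (void)n_past;
    return batch.size() > 128 ? 1 : 0; // 1 = "no KV slot found", 0 = ok
}

int main() {
    std::vector<int> embd(300, 42); // pretend prompt of 300 token ids
    int n_past = 0;

    bool evalres;
    int status = fake_decode(embd, n_past);
    if (status == 1 && embd.size() > 128) {
        // big batch failed: re-submit the same tokens in chunks of 128,
        // advancing the past-token position after each chunk
        std::vector<std::vector<int>> parts = split_big_vector(embd, 128);
        int temp_past = n_past;
        evalres = true;
        for (const std::vector<int>& chunk : parts) {
            evalres = evalres && (fake_decode(chunk, temp_past) == 0);
            temp_past += (int)chunk.size();
        }
        printf("retried %zu chunks, success=%d\n", parts.size(), (int)evalres);
    } else {
        evalres = (status == 0);
    }
    return evalres ? 0 : 1;
}

Advancing temp_past by the size of each chunk keeps the KV cache positions contiguous across retries, which is what lets the smaller batches land in the fragmented SWA cache that rejected the full batch.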