From 9f4d0f6ccf481f1d92bc5453dc8feca891be665a Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 21 Jul 2025 23:34:22 +0800
Subject: [PATCH] fixed swa pp bug by retrying smaller batches

---
 gpttype_adapter.cpp                 | 36 +++++++++++++++++++++--------
 otherarch/utils.cpp                 | 10 ++++++++
 otherarch/utils.h                   |  1 +
 src/llama-kv-cache-unified-iswa.cpp |  2 +-
 4 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 145290d98..e267c5a81 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2174,14 +2174,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_model_params model_params = llama_model_default_params();
         llama_context_params llama_ctx_params = llama_context_default_params();
         llama_ctx_params.n_ctx = clamped_max_context_length;
-        if(kcpp_data->use_contextshift)
-        {
-            llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
-        }
-        else
-        {
-            llama_ctx_params.n_ctx += (extra_context_handle_fragmentation/2);
-        }
+        llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
 
         llama_ctx_params.offload_kqv = !inputs.low_vram;
         llama_ctx_params.kv_unified = true;
@@ -3844,7 +3837,31 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             {
                 draft_used = false;
                 kcpp_embd_batch batch = kcpp_embd_batch(embd, n_past, use_mrope, false);
-                evalres = (llama_decode(llama_ctx_v4, batch.batch)==0);
+                int32_t decode_status = llama_decode(llama_ctx_v4, batch.batch);
+                if(decode_status==1 && embd.size()>128)
+                {
+                    printf("Couldn't find a big KV slot. Retry with smaller batch size of 128...\n");
+                    std::vector<std::vector<int>> parts = split_big_vector(embd,128);
+                    int temp_past = n_past;
+                    evalres = true;
+                    for(int p=0;p<parts.size();++p)
+                    {
+                        std::vector<int> chunk = parts[p];
+                        kcpp_embd_batch smallbatch = kcpp_embd_batch(chunk, temp_past, use_mrope, false);
+                        int32_t decode_status2 = llama_decode(llama_ctx_v4, smallbatch.batch);
+                        if(debugmode==1 && !is_quiet)
+                        {
+                            printf("Retry chunk: %d at %d... status: %s\n",chunk.size(),temp_past,(decode_status2==0?"ok":"fail"));
+                        }
+                        evalres = (evalres && (decode_status2==0));
+                        temp_past += chunk.size();
+                    }
+                }
+                else
+                {
+                    evalres = (decode_status==0);
+                }
+
                 if(draft_ctx)
                 {
                     evalres = (evalres && (llama_decode(draft_ctx, batch.batch)==0));
@@ -3928,6 +3945,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         if (!evalres)
         {
             fprintf(stderr, "\nFailed to predict at token position %d! Check your context buffer sizes!\n",n_past);
+            media_composite_image_signature = ""; //force invalidate
             output.text = nullptr;
             output.status = 0;
             output.prompt_tokens = output.completion_tokens = 0;
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index 1a836c441..ece5373e5 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -356,6 +356,16 @@ std::string get_timestamp_str()
     return timestamp;
 }
 
+//split a big vector into multiple small vectors of chunk size or less
+std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size) {
+    std::vector<std::vector<int>> small_arrs;
+    for (size_t i = 0; i < big_arr.size(); i += chunk_size) {
+        size_t end = std::min(i + chunk_size, big_arr.size());
+        small_arrs.emplace_back(big_arr.begin() + i, big_arr.begin() + end);
+    }
+    return small_arrs;
+}
+
 std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate)
 {
     size_t input_size = input.size();
diff --git a/otherarch/utils.h b/otherarch/utils.h
index 4cfb829f1..b4c17dc0d 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -61,6 +61,7 @@ std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_leng
 std::string kcpp_base64_encode(const std::string &data);
 std::string get_timestamp_str();
+std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size);
 std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate);
 
 int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng);
diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp
index 4c4e7cbae..9280a84a4 100644
--- a/src/llama-kv-cache-unified-iswa.cpp
+++ b/src/llama-kv-cache-unified-iswa.cpp
@@ -31,7 +31,7 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
     uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
 
     //kcpp: pad the swa kv cache as well, similar to extra_context_handle_fragmentation
-    size_swa += 32;
+    size_swa += 128;
     size_swa = GGML_PAD(size_swa, n_pad);
 
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
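
For reference, the retry logic added in gpttype_adapter.cpp boils down to the standalone sketch below. This is not koboldcpp code: fake_decode is a made-up stand-in for llama_decode (returning 1 when no contiguous KV slot fits the batch) and the token ids are invented; only split_big_vector is copied from the patch.

// Minimal sketch of the chunked-retry idea, under the assumptions stated above.
#include <algorithm>
#include <cstdio>
#include <vector>

// same helper as otherarch/utils.cpp: split a big vector into chunks of chunk_size or less
static std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size) {
    std::vector<std::vector<int>> small_arrs;
    for (size_t i = 0; i < big_arr.size(); i += chunk_size) {
        size_t end = std::min(i + chunk_size, big_arr.size());
        small_arrs.emplace_back(big_arr.begin() + i, big_arr.begin() + end);
    }
    return small_arrs;
}

// hypothetical stand-in for llama_decode: pretend any batch over 128 tokens cannot get a KV slot
static int fake_decode(const std::vector<int>& batch, int n_past) {
    (void)n_past;
    return batch.size() > 128 ? 1 : 0; // 1 = "no KV slot found", 0 = ok
}

int main() {
    std::vector<int> embd(300, 42); // pretend prompt of 300 token ids
    int n_past = 0;

    bool evalres;
    int status = fake_decode(embd, n_past);
    if (status == 1 && embd.size() > 128) {
        // big batch failed: re-submit the same tokens in chunks of 128,
        // advancing the past-token position after each chunk
        std::vector<std::vector<int>> parts = split_big_vector(embd, 128);
        int temp_past = n_past;
        evalres = true;
        for (const std::vector<int>& chunk : parts) {
            evalres = evalres && (fake_decode(chunk, temp_past) == 0);
            temp_past += (int)chunk.size();
        }
        printf("retried %zu chunks, success=%d\n", parts.size(), (int)evalres);
    } else {
        evalres = (status == 0);
    }
    return evalres ? 0 : 1;
}

Advancing temp_past by the size of each chunk keeps the KV cache positions contiguous across retries, which is what lets the smaller batches land in the fragmented SWA cache that rejected the full batch.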